def save_sms_sources(request):
    ''' Parser for the sms.txt file '''
    # fetch the Language object for Russian
    lang = Language.objects.get(pk=1)
    list_all = []
    sms_file = open(rel('sources/sms.txt'), 'r')
    # read the rows from the file
    for line in sms_file.read().split('\n'):
        listt = []
        for i, l in enumerate(line.split('\t')):
            if i == 1:
                listt.append(capit(low(l)))
            elif i == 2:
                listt.append(l.split(' ')[0])
            elif i == 4:
                listt.append(l)
        # collect the row data
        list_all.append(listt)
    sms_file.close()
    # fetch the source object for sms
    source = ImportSources.objects.get(source='SMS')
    # walk the collected rows
    for l in list_all:
        try:
            if l[1] != 'ЗАКРЫТ':
                # strip special characters from the name
                slug_city = low(del_separator(l[1]))
                # look up by the cleaned name
                try:
                    city = City.objects.get(name__name=slug_city)
                except City.DoesNotExist:
                    # if not found, look up by the name from the source
                    try:
                        city = City.objects.get(name__name=l[1])
                    except City.DoesNotExist:
                        # if not found, look up by the lower-cased source name
                        try:
                            city = City.objects.get(name__name=capit(low(l[1])))
                        except City.DoesNotExist:
                            city = None
                if city:
                    # strip special characters from the name
                    slug_cinema = low(del_separator(l[0]))
                    # look up by the cleaned name
                    try:
                        cinema = Cinema.objects.get(name__name=slug_cinema, city=city.id)
                    except Cinema.DoesNotExist:
                        # if not found, look up by the name from the source
                        try:
                            cinema = Cinema.objects.get(name__name=l[0], city=city.id)
                        except Cinema.DoesNotExist:
                            cinema = None
                    if cinema:
                        # get/create the halls for this cinema in this city
                        name1 = create_hallname(1, lang, 'без указания зала')
                        name2 = create_hallname(2, lang, 'безуказаниязала')
                        hall = create_hall((name1, name2), 0, 0, cinema)
                        # store the source url in the DB for later schedule imports
                        try:
                            HallsSources.objects.get(id_hall=hall, source=source, url_hall_sources=l[2])
                        except HallsSources.DoesNotExist:
                            HallsSources(id_hall=hall, source=source, url_hall_sources=l[2]).save()
                    else:
                        # cinema not found, log it
                        logger(**{'event': 2, 'code': 2, 'bad_obj': l[0], 'obj1': l[1], 'obj2': l[2], 'extra': city.id})
                else:
                    # city not found, log it
                    logger(**{'event': 2, 'code': 1, 'bad_obj': capit(low(l[1])), 'obj2': l[2]})
        except IndexError:
            pass
    return HttpResponseRedirect(reverse("main_kai"))
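# A minimal sketch of the sms.txt row layout the parser above assumes, inferred
# only from the tab indices it reads (not from a spec), with hypothetical values:
#
#   col0<TAB>col1 (cinema name)<TAB>col2 (city ...)<TAB>col3<TAB>col4 (hall url)
#   123<TAB>КИНОМАКС<TAB>Москва центр<TAB>x<TAB>http://example.com/hall/1
#
# Column 2 keeps only its first space-separated token; rows whose city column
# reads 'ЗАКРЫТ' are skipped.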
def person_create_func(name_ru, parental, name_en):
    person_obj = AfishaPersons.objects.using('afisha').create(
        birth_year=0,
        birth_mounth=0,
        birth_day=0,
        male=0,
        national=0,
        country_id=0,
        imdb=0)
    person = Person.objects.create(kid=person_obj.id)
    names_list = [
        {'name': name_ru.strip(), 'status': 1, 'lang': 1},
        {'name': low(del_separator(name_ru.strip().encode('utf-8'))), 'status': 2, 'lang': 1},
        {'name': name_en.strip(), 'status': 1, 'lang': 2},
        {'name': low(del_separator(name_en.strip().encode('utf-8'))), 'status': 2, 'lang': 2},
        {'name': parental.strip(), 'status': 3, 'lang': 1},
    ]
    for i in names_list:
        if i['name']:
            if i['status'] == 1:
                try:
                    afisha_person_name_create(person_obj, i['name'], i['lang'])
                except db.backend.Database._mysql.OperationalError:
                    i['name'] = i['name'].encode('ascii', 'xmlcharrefreplace')
                    afisha_person_name_create(person_obj, i['name'], i['lang'])
            name, created = person_name_create(i['name'], i['lang'], i['status'])
            person.name.add(name)
    return person_obj
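# Usage sketch for person_create_func (hypothetical names; assumes, from the
# names_list above, that status 1 is a display name, 2 a slug, 3 a patronymic,
# and that lang 1/2 mean ru/en):
#
#   person_obj = person_create_func(u'Иван Петров', u'Иванович', u'Ivan Petrov')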
def nowru_ident():
    source = ImportSources.objects.get(url='http://www.now.ru/')
    ignored = get_ignored_films()
    data_nof_film = ''
    nowru_data = Nowru.objects.filter(kid=None)
    for i in nowru_data:
        name_ru_slug = low(del_separator(i.name_ru.encode('utf-8')))
        if name_ru_slug.decode('utf-8') not in ignored:
            name_en_slug = low(del_separator(i.name_en.encode('utf-8')))
            kid, info = film_identification(name_ru_slug, name_en_slug, {}, {}, year=i.year, source=source)
            if kid:
                i.kid = kid
                i.save()
            else:
                if 'slug="%s"' % name_ru_slug not in data_nof_film:
                    name_ru = i.name_ru.encode('utf-8')
                    name_en = i.name_en.encode('utf-8')
                    data_nof_film += xml_noffilm(name_ru, name_ru_slug, name_en, name_en_slug, i.nowru_id, info, None, source.id)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)
    cron_success('xml', source.dump, 'players', 'Онлайн плееры')
def raspishi_relations():
    source = ImportSources.objects.get(url='http://распиши.рф/')
    ignored = get_ignored_films()
    data_nof_film = ''
    domain = u'распиши.рф'
    url = 'http://%s/getfilmxml.php' % domain.encode('idna')
    req = urllib.urlopen(url)
    if req.getcode() == 200:
        films_rid = list(RaspishiRelations.objects.exclude(kid=0).values_list('rid', flat=True))
        xml_data = BeautifulSoup(req.read(), from_encoding="utf-8")
        for i in xml_data.findAll('movie'):
            id = int(i['id'])
            if id not in films_rid:
                name_ru = i.find('name').text.encode('utf-8')
                name_en = i.find('nameeng').text.encode('utf-8')
                name_ru = re.sub(r'\(.*?\)', '', name_ru).strip()
                name_en = re.sub(r'\(.*?\)', '', name_en).strip()
                name_slug = low(del_separator(del_screen_type(name_ru)))
                name_en_slug = low(del_separator(del_screen_type(name_en)))
                if name_slug.decode('utf-8') not in ignored:
                    try:
                        kid, info = film_identification(name_slug, None, {}, {}, source=source)
                        if kid:
                            created = RaspishiRelations.objects.create(
                                rid=id,
                                kid=kid,
                                name_ru=name_ru,
                                name_en=name_en,
                            )
                        else:
                            data_nof_film += xml_noffilm(name_ru, name_slug, name_en, name_en_slug, id, info, None, source.id)
                    except db.backend.Database._mysql.OperationalError:
                        pass
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)
    cron_success('xml', source.dump, 'films', 'Укр. сеансы')
def get_cinemate_cc_film(data, source, ignored, noffilms):
    flist = []
    for div in data.findAll('div', {'class': "movie-brief"}):
        h3 = div.find('h3')
        a = h3.find('a')
        film_url = a.get('href')
        film_id = int(film_url.replace('/movie/', '').replace('/', ''))
        film_name = a.text.encode('utf-8')
        film_slug = low(del_separator(film_name))
        if film_slug.decode('utf-8') not in ignored and film_id not in noffilms:
            full_url = '%s%s' % (source.url, film_url.lstrip('/'))
            film_year = int(h3.find('small').text.encode('utf-8').replace('(', '').replace(')', ''))
            # keep the film only if the block offers a download link
            has_download = False
            ul = div.find('ul')
            for link in ul.findAll('a'):
                a_txt = link.text.encode('utf-8').strip()
                if a_txt == 'Скачать':
                    has_download = True
            if has_download:
                flist.append({
                    'id': film_id,
                    'name': film_name,
                    'slug': film_slug,
                    'year': film_year,
                    'url': full_url,
                })
    return flist
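# Usage sketch for get_cinemate_cc_film (the listing URL is hypothetical; the
# function only needs a parsed page containing 'movie-brief' blocks):
#
#   req = urllib.urlopen('%smovies/' % source.url)
#   page = BeautifulSoup(req.read(), from_encoding="utf-8")
#   films = get_cinemate_cc_film(page, source, get_ignored_films(), [])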
def imdb_search2(imdb_id, name, year, kid):
    film_name = name
    slug = low(del_separator(film_name.encode('utf-8')))
    film_name = film_name.encode('ascii', 'xmlcharrefreplace')
    xml = '<film n="%s" s="%s" y="%s" id="%s" d="" r=""></film>' % (
        film_name, slug, str(year).encode('utf-8'), str(imdb_id).encode('utf-8'))
    data = exp_film_data(imdb_id)
    if data:
        if data.get('double'):
            return simplejson.dumps(data)
        else:
            if not data['kid']:
                pass
            elif int(data['kid']) != int(kid):
                return simplejson.dumps({'status': True, 'redirect': True, 'kid': data['kid']})
    data_nof_persons, distr_nof_data, dump, good = get_imdb_data(xml, False, 1, [int(imdb_id)], True, kid)
    if good:
        data = exp_film_data(imdb_id)
        if not data:
            data = {'status': False}
    else:
        data = {'status': False}
    if kid:
        cache.delete_many(['get_film__%s' % kid, 'film__%s__fdata' % kid])
    return simplejson.dumps(data)
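# Usage sketch for imdb_search2 (hypothetical IDs; the function returns a JSON
# string, so callers decode it first):
#
#   result = simplejson.loads(imdb_search2('0111161', u'The Shawshank Redemption', 1994, kid))
#   if result.get('redirect'):
#       pass  # the film is already identified under result['kid']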
def imdb_film_ident():
    source = ImportSources.objects.get(url='http://www.imdb.com/')
    films = Films.objects.filter(kid=None)
    films_ids = [i.imdb_id for i in films]
    exist_films = Film.objects.using('afisha').filter(idalldvd__in=films_ids)
    exist_ids = {}
    for i in exist_films:
        exist_ids[i.idalldvd] = i.id
    data_nof_film = ''
    for i in films:
        name = None
        for j in i.name.filter(status=1, language__id=2):
            name = j.name.encode('utf-8')
        slug = low(del_separator(name))
        kid = exist_ids.get(long(i.imdb_id))
        if kid:
            i.kid = kid
            i.save()
        else:
            full_url = '%stitle/tt%s/' % (source.url, i.imdb_id)
            data_nof_film += xml_noffilm(name, slug, None, None, i.imdb_id, 'Фильм не найден', full_url.encode('utf-8'), source.id)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'films_ident', 'Идентификация')
def person_name_detect(request, ru, en):
    try:
        film_editor = is_film_editor(request)
        if film_editor:
            name = escape(strip_tags(ru)).encode('utf-8').strip()
            en = escape(strip_tags(en)).encode('utf-8').strip()
            slug_ru = low(del_separator(name))
            slug_en = low(del_separator(en))
            queries = []
            if name:
                queries.append(Q(name__icontains=slug_ru, status=1))
            if en:
                queries.append(Q(name__icontains=en, status=1))
            query = queries.pop()
            for item in queries:
                query |= item
            data = list(NamePerson.objects.filter(
                query,
                language__id__in=(1, 2),
                person__kid__gt=0).values('language', 'person__kid', 'name'))
            names = {}
            for i in data:
                if not names.get(i['person__kid']):
                    names[i['person__kid']] = {'ru': '', 'en': '', 'id': i['person__kid']}
                if i['language'] == 1:
                    names[i['person__kid']]['ru'] = i['name']
                elif i['language'] == 2:
                    names[i['person__kid']]['en'] = i['name']
            names = sorted(names.values(), key=operator.itemgetter('ru'))
            txt = ''
            for i in names:
                txt += u'<div style="border-bottom:1px solid #CCC; padding:5px; background:#EBEBEB; min-width: 300px;"><a href="http://kinoinfo.ru/person/%s/" target="_blank">%s / %s</a></div>' % (i['id'], i['ru'], i['en'])
            if txt:
                txt = u'В базе есть похожие персоны:<br />%s' % txt
            return simplejson.dumps({
                'status': True,
                'content': txt,
            })
    except Exception as e:
        open('errors.txt', 'a').write('%s * (%s)' % (dir(e), e.args))
def get_imdb_film_list():
    source = ImportSources.objects.get(url='http://www.imdb.com/')
    url = '%scalendar/?region=us' % source.url
    opener = give_me_cookie()
    req = opener.open(urllib2.Request(url))
    xml = ''
    ids = []
    if req.getcode() == 200:
        data = BeautifulSoup(req.read(), from_encoding="utf-8")
        div = data.find('div', id="main")
        old_date = ''
        for h4 in div.findAll('h4'):
            release = h4.string.encode('utf-8')
            day, month, year = release.split()
            month = get_month_en(low(month))
            rel_date = '%s-%s-%s' % (year, month, day)
            xml += '<date v="%s">' % rel_date
            ul = h4.find_next('ul')
            for li in ul.findAll('li'):
                year = li.find('span', {'class': "year_type"}).string.encode('utf-8')
                if 'documentary' not in low(year):
                    year = re.findall(r'\d+', year)
                    if year:
                        details = li.find('i')
                        if details:
                            details = str(details).encode('utf-8').replace('<i>', '').replace('</i>', '')
                            details = details.replace('(', '').replace(')', '')
                        else:
                            details = ''
                        if 'limited' not in low(details) and 'fest' not in low(details) or 'tv premiere' not in low(details):
                            # unescape HTML entities in the title
                            film_name = li.a.string.encode('utf-8').replace('&quot;', '"').replace('&amp;', '&')
                            film_slug = low(del_separator(film_name))
                            full_url = li.a.get('href').encode('utf-8')
                            imdb_id = full_url.replace('/title/tt', '').replace('/', '')
                            xml += '<film n="%s" s="%s" y="%s" id="%s" d="%s" r="%s"></film>' % (film_name, film_slug, year[0], imdb_id, details, rel_date)
                            ids.append(imdb_id)
            xml += '</date>'
    ids = ';'.join(set(ids))
    xml = '<data><ids value="%s">%s</ids></data>' % (ids, xml)
    create_dump_file('%s_film_list' % source.dump, settings.API_DUMP_PATH, xml)
    cron_success('html', source.dump, 'films_list', 'Список релизов')
def imdb_person_search(request, pid, name, exist):
    try:
        from person.views import person_name_create
        from release_parser.imdb import imdb_person_searching

        if request.user.is_superuser:
            name = escape(strip_tags(name)).encode('utf-8').strip()
            slug = low(del_separator(name))
            person = Person.objects.get(pk=pid)
            # if there was no (en) name yet, create it
            if not exist:
                if name:
                    exist = True
                    person_names = person.name.all()
                    names = [
                        {'name': name, 'status': 1},
                        {'name': slug, 'status': 2},
                    ]
                    for i in names:
                        name_obj, created = person_name_create(i['name'], i['status'], 2)
                        if name_obj not in person_names:
                            person.name.add(name_obj)
            if exist:
                result = imdb_person_searching(name)
                txt = ''
                for i in result:
                    txt += '<div style="border-bottom:1px solid #CCC; padding:5px; background:#EBEBEB; min-width: 300px;"><a href="http://www.imdb.com%s" target="_blank">%s</a> <i>%s</i><br /> <input type="button" value="Выбрать" id="%s" class="imdb_person_list_select" /></div>' % (i['link'].encode('utf-8'), i['title'], i['details'], i['id'])
                txt += '<br /><div>Или укажите ссылку на страницу персоны IMDb:<br /><input type="text" value="" size="40" class="imdb_person_url" /> <input type="button" value="Искать" class="imdb_person_list_select" /><input type="hidden" value="%s" id="pid" /></div>' % person.id
                return simplejson.dumps({
                    'status': True,
                    'content': txt,
                    'query': name,
                })
        return simplejson.dumps({})
    except Exception as e:
        open('errors.txt', 'a').write('%s * (%s)' % (dir(e), e.args))
def get_okinoua_cities():
    """Parse the list of cities in Ukraine"""
    source = ImportSources.objects.get(url='http://www.okino.ua/')
    # fetch the cities already stored for this source (SourceCities) as a list
    cities_ids = get_source_data(source, 'city', 'list')
    data_nof_city = ''
    # open the page with the cities
    url = '%skinoafisha-kiev/' % source.url
    req = urllib.urlopen(url)
    if req.getcode() == 200:
        page = BeautifulSoup(req.read(), from_encoding="utf-8")
        # find all city tags and read the ids and names out of them
        for ul in page.findAll('ul', {'class': 'blist'}):
            for li in ul.findAll('li'):
                id = li.a.get('href').replace('/', '')
                name = li.a.string.encode('utf-8').strip()
                name_slug = low(del_separator(name))
                # compare against the cities in our DB; if there is NO match yet
                if id not in cities_ids:
                    # try to identify the new city
                    city = City.objects.filter(name__name=name_slug, name__status=2).distinct('pk')
                    # if identified, store it in SourceCities
                    if city.count() == 1:
                        SourceCities.objects.create(
                            source_id=id,
                            source_obj=source,
                            city=city[0],
                            name=name,
                        )
                    # otherwise collect the unmatched city into xml for later review
                    else:
                        if 'slug="%s"' % name_slug not in data_nof_city:
                            data_nof_city += '<city name="%s" slug="%s"></city>' % (name, name_slug)
    create_dump_file('okinoua_nof_city', settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_city)
    cron_success('html', 'okinoua', 'cities', 'Укр. города')
def get_rambler_cities():
    source = ImportSources.objects.get(url='http://www.rambler.ru/')
    cities_ids = get_source_data(source, 'city', 'list')
    data_nof_city = ''

    '''
    # LOCALHOST
    f = open('%s/dump_rambler_city.xml' % settings.API_DUMP_PATH, 'r')
    xml = BeautifulSoup(f.read(), from_encoding="utf-8")
    f.close()
    if xml:
    # --- end localhost
    '''

    # SERVER
    url = 'http://api.kassa.rambler.ru/v2/%s/xml/cities/' % RAMBLER_API_KEY  # dump_rambler_city.xml
    req = urllib.urlopen(url)
    if req.getcode() == 200:
        xml = BeautifulSoup(req.read(), from_encoding="utf-8")
        # --- end server
        for i in xml.findAll('city'):
            id = i.cityid.string
            name = i.find('name').string.encode('utf-8')
            name_slug = low(del_separator(name))
            if id not in cities_ids:
                city = City.objects.filter(name__name=name_slug, name__status=2).distinct('pk')
                if city.count() == 1:
                    SourceCities.objects.create(
                        source_id=id,
                        source_obj=source,
                        city=city[0],
                        name=name,
                    )
                else:
                    if 'slug="%s"' % name_slug not in data_nof_city:
                        data_nof_city += '<city name="%s" slug="%s"></city>' % (name, name_slug)
    create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_city)
    cron_success('xml', source.dump, 'cities', 'Города')
def get_name_film_obj(film):
    ''' Fetch the name object for a film '''
    # strip the screen format (3D, 2D, ...) from the title
    f = del_screen_type(film)
    # strip special characters and lower-case the title
    f = low(del_separator(f))
    # search by the cleaned title
    try:
        name = NameProduct.objects.filter(name=f)[0]
    except IndexError:
        # if not found, search by the source title
        try:
            name = NameProduct.objects.filter(name=film)[0]
        except IndexError:
            # if not found, search by the lower-cased source title
            try:
                name = NameProduct.objects.filter(name=low(film))[0]
            except IndexError:
                # if not found, search by the capitalized source title
                try:
                    name = NameProduct.objects.filter(name=capit(film))[0]
                except IndexError:
                    name = None
    return name
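# Usage sketch for get_name_film_obj (hypothetical title; shows the fallback
# chain above resolving a formatted title to a NameProduct row, or None):
#
#   name_obj = get_name_film_obj('Аватар 3D')
#   if name_obj is None:
#       pass  # the title needs manual identification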
def get_premierzal_cities():
    source = ImportSources.objects.get(url='http://www.premierzal.ru/')
    cities = get_source_data(source, 'city', 'list')
    data_nof_city = ''
    req = urllib.urlopen(source.url)
    if req.getcode() == 200:
        data = BeautifulSoup(req.read())
        block = data.find('div', {'class': 'drop'})
        for i in block.findAll('a'):
            city_name = i.text.encode('utf-8').strip()
            city_id = low(del_separator(city_name))
            if city_id.decode('utf-8') not in cities:
                city = City.objects.filter(name__name=city_id, name__status=2).distinct('pk')
                if city.count() == 1:
                    SourceCities.objects.create(
                        source_id=city_id,
                        source_obj=source,
                        city=city[0],
                        name=city_name,
                    )
                else:
                    data_nof_city += '<city name="%s" slug="%s"></city>' % (city_name, city_id)
                cities.append(city_id.decode('utf-8'))
    create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_city)
    cron_success('html', source.dump, 'cities', 'Города')
def get_kinohod_cinemas():
    # print "BEGIN get_kinohod_cinemas()"
    t1 = time.time()
    start_time = datetime.datetime.now().strftime('%H:%M:%S')

    cron_data_new = ''
    cron_data_nof = ''
    cron_count = 0

    main_url = 'http://www.kinohod.ru/api/rest/partner/v1/cinemas?apikey=%s' % SERVER_API_KEY
    source = ImportSources.objects.get(url='http://kinohod.ru/')
    kinohod_cinemas = get_source_data(source, 'cinema', 'list')
    kinohod_cities_dict = get_source_data(source, 'city', 'dict')

    cinemass = Cinema.objects.all()
    cinemass_dict = {}
    for i in cinemass:
        cinemass_dict[i.code] = i

    count = 0
    data_nof_cinema = ''
    for cid, kinohod_city in kinohod_cities_dict.iteritems():
        try:
            url = '%s&city=%s' % (main_url, cid)
            req = urllib.urlopen(url)
            if req.getcode() == 200:
                json_data = req.read()
                data = json.loads(json_data)
                for i in data:
                    cron_count += 1
                    id = str(i['id']).decode('utf-8')
                    if id not in kinohod_cinemas:
                        name = i['title']
                        name_slug = del_screen_type(name.encode('utf-8'))
                        name_slug = low(del_separator(name_slug))
                        short_name = i['shortTitle']
                        short_name_slug = del_screen_type(short_name.encode('utf-8'))
                        short_name_slug = low(del_separator(short_name_slug))
                        filter1 = {'name__name': name_slug, 'name__status': 2, 'city__id': kinohod_city.city_id}
                        filter2 = {'name__name': short_name_slug, 'name__status': 2, 'city__id': kinohod_city.city_id}
                        cinema_kid = cinema_identification(short_name_slug, filter1, filter2)
                        cin_obj = cinemass_dict.get(cinema_kid)
                        if cin_obj:
                            SourceCinemas.objects.create(
                                source_id=id,
                                source_obj=source,
                                city=kinohod_city,
                                cinema=cin_obj,
                                name=name,
                                name_alter=short_name,
                            )
                            cron_data_new += '%s<br />' % short_name.encode('utf-8')
                        else:
                            count += 1
                            name_city = kinohod_city.name
                            data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (
                                short_name.encode('utf-8'), short_name_slug,
                                name_city.encode('utf-8'), kinohod_city.city.kid)
                            cron_data_nof += '%s<br />' % short_name.encode('utf-8')
                        kinohod_cinemas.append(id)
        except IOError:
            open('%s/ddd.txt' % settings.API_DUMP_PATH, 'a').write(str(url) + '\n')

    data_nof_cinema += '<sum>%s</sum>' % count
    create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_cinema)

    # cron log
    end_time = datetime.datetime.now().strftime('%H:%M:%S')
    cron_data = '%s | %s - %s %s\n' % (datetime.datetime.now().date(), start_time, end_time, 'Импорт кинотеатров киноход')
    cron_data += '<br /><b>Обработано</b>: %s' % cron_count
    cron_data += '<br /><b>Новые</b>: <br />%s' % cron_data_new
    cron_data += '<br /><b>Ненайденные</b>: <br />%s' % cron_data_nof
    for i in range(50):
        cron_data += '- '
    process_time = time.time() - t1
    cron_data = '<br />* %s сек.<br />%s' % (process_time, cron_data)
    open('%s/cron_log_kinohod_cinemas.txt' % settings.CRON_LOG_PATH, 'a').write(cron_data)
    cron_success('json', source.dump, 'cinemas', 'Кинотеатры')
def orsk_streets_fix(request):
    source = ImportSources.objects.get(url='http://www.orgpage.ru/')
    orgs = list(Organization.objects.filter(source_obj=source).values_list('id', flat=True))
    builds = Building.objects.select_related('street').filter(organization__id__in=orgs)
    for ind, i in enumerate(builds):
        if ind not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19):
            st_type = i.street.name
            st = st_type
            street_name = st_type
            street_type = '1'
            #return HttpResponse(str(st_type.encode('utf-8')))
            if u'Набережная' in st_type or u'набережная' in st_type:
                street_name = st.encode('utf-8').replace('Набережная', '').replace('набережная', '').strip()
                street_type = '4'
            elif u'шоссе' in st_type or u'ш.' in st_type:
                street_name = st.encode('utf-8').replace('шоссе', '').replace('ш.', '').strip()
                street_type = '5'
            elif u'пл.' in st_type or u'площадь' in st_type or u'Площадь' in st_type or u'плщ.' in st_type:
                street_name = st.encode('utf-8').replace('площадь', '').replace('пл.', '').replace('плщ.', '').replace('Площадь', '').strip()
                street_type = '3'
            elif u'проезд' in st_type or u'Проезд' in st_type:
                street_name = st.encode('utf-8').replace('проезд', '').replace('Проезд', '').strip()
                street_type = '7'
            elif u'Парк' in st_type or u'парк' in st_type:
                street_name = st.encode('utf-8').replace('парк', '').replace('Парк', '').strip()
                street_type = '10'
            elif u'пер.' in st_type or u'переулок' in st_type or u'Пер.' in st_type:
                street_name = st.encode('utf-8').replace('пер.', '').replace('Пер.', '').replace('переулок', '').strip()
                street_type = '2'
            elif u'пр-кт' in st_type or u'проспект' in st_type or u'Пр.' in st_type or u'просп.' in st_type:
                street_name = st.encode('utf-8').replace('пр-кт', '').replace('проспект', '').replace('Пр.', '').replace('просп.', '').strip()
                street_type = '6'
            elif u'ул.' in st_type or u'улица' in st_type or u'Ул.' in st_type:
                street_name = st.encode('utf-8').replace('ул.', '').replace('Ул.', '').replace('улица', '').strip()
                street_type = '1'
            street_name = street_name.replace('ул.', '').replace('улица', '').replace('Ул.', '').strip()
            name_slug = low(del_separator(street_name))
            i.street.name = street_name
            i.street.slug = name_slug
            i.street.type = street_type
            i.street.save()
    return HttpResponse(str())
def get_orsk_organizations():
    source = ImportSources.objects.get(url='http://www.orgpage.ru/')

    org_phones = OrganizationPhones.objects.filter(organization__source_obj=source)
    phones_objs = {}
    for i in org_phones:
        phones_objs[i.phone] = i

    org_tags = OrganizationTags.objects.all()
    tags_objs = {}
    for i in org_tags:
        tags_objs[i.name] = i

    org_streets = Street.objects.all()
    org_streets_dict = {}
    for i in org_streets:
        org_streets_dict[i.slug.encode('utf-8')] = i

    source_ids = list(Organization.objects.filter(source_obj=source).values_list('source_id', flat=True))

    city_name = 'Орск'
    city = City.objects.get(name__name=city_name, name__status=1)

    '''
    # stage 1: collect organization list urls per category
    urls = [
        'http://www.orgpage.ru/орск_и_городской_округ_орск/администрация,_органы_исполнительной_власти/',
        'http://www.orgpage.ru/орск_и_городской_округ_орск/бизнес/',
        'http://www.orgpage.ru/орск_и_городской_округ_орск/досуг/',
        'http://www.orgpage.ru/орск_и_городской_округ_орск/жкх_и_благоустройство/',
        'http://www.orgpage.ru/орск_и_городской_округ_орск/культура/',
        'http://www.orgpage.ru/орск_и_городской_округ_орск/магазины/',
        'http://www.orgpage.ru/орск_и_городской_округ_орск/медицина/',
        'http://www.orgpage.ru/орск_и_городской_округ_орск/наука/',
        'http://www.orgpage.ru/орск_и_городской_округ_орск/образование/',
        'http://www.orgpage.ru/орск_и_городской_округ_орск/общественные_организации/',
        'http://www.orgpage.ru/орск_и_городской_округ_орск/посольства/',
        'http://www.orgpage.ru/орск_и_городской_округ_орск/промышленность/',
        'http://www.orgpage.ru/орск_и_городской_округ_орск/ремонт/',
        'http://www.orgpage.ru/орск_и_городской_округ_орск/организации_социального_комплекса/',
        'http://www.orgpage.ru/орск_и_городской_округ_орск/строительство/',
        'http://www.orgpage.ru/орск_и_городской_округ_орск/оптовая_торговля,_поставка_оборудования/',
        'http://www.orgpage.ru/орск_и_городской_округ_орск/транспорт/',
        'http://www.orgpage.ru/орск_и_городской_округ_орск/туризм_и_отдых/',
        'http://www.orgpage.ru/орск_и_городской_округ_орск/управление_и_контроль/',
        'http://www.orgpage.ru/орск_и_городской_округ_орск/услуги/',
    ]
    xml = ''
    count = 0
    for url in urls:
        req = urllib.urlopen(url)
        count += 1
        if req.getcode() == 200:
            data = BeautifulSoup(req.read(), from_encoding="utf-8")
            try:
                div = data.find('div', {'class': 'r_alphabet'})
                for link in div.findAll('li'):
                    name = link.a.text.encode('utf-8')
                    link = link.a.get('href').encode('utf-8')
                    xml += '<url link="%s" name="%s"></url>' % (link, name)
            except AttributeError:
                pass
        if count % 5 == 0:
            time.sleep(random.uniform(1.0, 3.0))
    open('organizations_orsk.xml', 'w').write(str('<data>%s</data>' % xml))
    '''

    '''
    # stage 2: collect organization pages from each list url
    with open('%s/organizations_orsk.xml' % settings.API_DUMP_PATH, 'r') as f:
        data = BeautifulSoup(f.read(), from_encoding="utf-8")
    xml = ''
    count = 0
    for i in data.findAll('url'):
        url = '%s?order=date&onpage=100' % i['link'].encode('utf-8')
        tag = i['name'].encode('utf-8')
        req = urllib.urlopen(url)
        count += 1
        if req.getcode() == 200:
            data = BeautifulSoup(req.read(), from_encoding="utf-8")
            for a in data.findAll('a', {'class': 'name'}):
                name = a.text.encode('utf-8').replace('"', "'")
                link = a.get('href').encode('utf-8')
                xml += '<url link="%s" name="%s" tag="%s"></url>' % (link, name, tag)
        if count % 10 == 0:
            time.sleep(random.uniform(1.0, 3.0))
    open('organizations_orsk2.xml', 'w').write(str('<data>%s</data>' % xml))
    '''

    '''
    # stage 3: group the collected pages by url, merging tags
    with open('%s/organizations_orsk2.xml' % settings.API_DUMP_PATH, 'r') as f:
        data = BeautifulSoup(f.read(), from_encoding="utf-8")
    orgs = {}
    for i in data.findAll('url'):
        url = i['link'].encode('utf-8')
        name = i['name'].encode('utf-8')
        tag = i['tag'].encode('utf-8')
        if orgs.get(url):
            orgs[url]['tag'].append(tag)
        else:
            orgs[url] = {'tag': [tag], 'name': name, 'url': url}
    xml = ''
    for i in orgs.values():
        xml += '<url name="%s" url="%s">' % (i['name'], i['url'])
        for t in i['tag']:
            xml += '<tag name="%s"></tag>' % t
        xml += '</url>'
    open('organizations_orsk3.xml', 'w').write(str('<data>%s</data>' % xml))
    '''

    with open('%s/organizations_orsk3.xml' % settings.API_DUMP_PATH, 'r') as f:
        data = BeautifulSoup(f.read(), from_encoding="utf-8")

    count = 0
    for i in data.findAll('url'):
        count += 1
        url = i['url'].encode('utf-8')
        title = i['name'].encode('utf-8')

        uni = unidecode(i['name'])
        uni = re.findall(ur'[a-z0-9]+', low(uni))
        uni = '-'.join(uni) if uni else ''

        source_id = url.replace('http://www.orgpage.ru/орск_и_городской_округ_орск/', '').decode('utf-8')
        if source_id not in source_ids:
            req = urllib.urlopen(url)
            if req.getcode() == 200:
                data = BeautifulSoup(req.read(), from_encoding="utf-8")

                # a page without an address block leaves the organization without a building
                building_obj = None
                address = data.find('span', itemprop='streetAddress')
                if address:
                    address = address.text.encode('utf-8')
                    street_name, street_type, house = get_org_street(address.decode('utf-8'))
                    if street_type:
                        if street_name:
                            street_slug = low(del_separator(street_name))
                            street_obj = org_streets_dict.get(street_slug)
                            if not street_obj:
                                street_obj = Street.objects.create(name=street_name, slug=street_slug, type=street_type)
                                org_streets_dict[street_slug] = street_obj
                        else:
                            street_obj = None
                            house = None
                        building_obj = org_build_create(house, city, street_obj)

                # phones, site, email
                phones = []
                for ph in data.findAll('span', itemprop='telephone'):
                    ph = ph.text.encode('utf-8').replace(' ', '').replace('-', '').replace('–', '')
                    phone = REG_PHONE.findall(ph)
                    if phone:
                        phone = phone[0].replace('(', '').replace(')', '')
                        phone = phone.decode('utf-8')
                        phone_obj = phones_objs.get(phone)
                        if not phone_obj:
                            phone_obj = OrganizationPhones.objects.create(phone=phone)
                            phones_objs[phone] = phone_obj
                        phones.append(phone_obj)

                site = None
                site_block = data.find('li', id='list_sites')
                if site_block:
                    site = site_block.find('a', itemprop='url')
                    if site:
                        site = site.text.encode('utf-8')

                email = data.find('span', itemprop='email')
                if email:
                    email = email.text.encode('utf-8')

                # labels (categories, tags)
                tags = []
                for cat in i.findAll('tag'):
                    category_name = cat['name']
                    category_obj = tags_objs.get(category_name)
                    if not category_obj:
                        category_obj = OrganizationTags.objects.create(name=category_name)
                        tags_objs[category_name] = category_obj
                    tags.append(category_obj)

                org_obj = Organization.objects.create(
                    name=title,
                    site=site,
                    email=email,
                    note=None,
                    source_obj=source,
                    source_id=source_id,
                )
                org_obj.uni_slug = '%s-%s' % (uni, org_obj.id)
                org_obj.save()

                for j in phones:
                    org_obj.phones.add(j)
                for j in tags:
                    org_obj.tags.add(j)
                if building_obj:
                    org_obj.buildings.add(building_obj)

                source_ids.append(source_id)

        if count % 10 == 0:
            time.sleep(random.uniform(1.0, 3.0))
    return HttpResponse(str())
def get_0654_organizations():
    source = ImportSources.objects.get(url='http://m.0654.com.ua/')

    org_phones = OrganizationPhones.objects.filter(organization__source_obj=source)
    phones_objs = {}
    for i in org_phones:
        phones_objs[i.phone] = i

    org_tags = OrganizationTags.objects.filter(organization__source_obj=source)
    tags_objs = {}
    for i in org_tags:
        tags_objs[i.name] = i

    org_streets = Street.objects.all()
    org_streets_dict = {}
    for i in org_streets:
        org_streets_dict[i.slug.encode('utf-8')] = i

    source_ids = list(Organization.objects.filter(source_obj=source).values_list('source_id', flat=True))

    city_name = 'Ялта'
    city = City.objects.get(name__name=city_name, name__status=1)

    with open('%s/organizations.xml' % settings.API_DUMP_PATH, 'r') as f:
        data = BeautifulSoup(f.read(), from_encoding="utf-8")

    temp = {}
    count = 0
    for i in data.findAll('url'):
        count += 1
        url = i['value'].encode('utf-8')
        source_id = url.replace('http://m.0654.com.ua/catalog/full/', '').decode('utf-8')
        if source_id not in source_ids:
            req = urllib.urlopen(url)
            if req.getcode() == 200:
                data = BeautifulSoup(req.read(), from_encoding="utf-8")

                phones_temp = []
                phones = []
                streets = []
                tags = []
                email = None
                site = None

                # name
                title = data.find('h2').text.encode('utf-8').split(' ')[0].strip()

                # description
                description = str(data.find('div', {'class': 'discription'})).replace('<div class="discription">', '').replace('</div>', '').strip()
                if not description:
                    description = None

                # addresses
                address = data.find('a', {'class': 'addr'})
                if address:
                    # several addresses may be separated by ';'
                    address_temp = address.string.encode('utf-8').split(';')
                    for addr in address_temp:
                        street_name, street_type, house = get_org_street(addr.decode('utf-8'))
                        if street_type:
                            if street_name:
                                street_slug = low(del_separator(street_name))
                                street_obj = org_streets_dict.get(street_slug)
                                if not street_obj:
                                    street_obj = Street.objects.create(name=street_name, slug=street_slug, type=street_type)
                                    org_streets_dict[street_slug] = street_obj
                            else:
                                street_obj = None
                                house = None
                            building_obj = org_build_create(house, city, street_obj)
                            streets.append(building_obj)
                else:
                    building_obj = org_build_create(None, city, None)
                    streets.append(building_obj)
                    street_type = True

                if street_type:
                    # phones, site, email
                    table = data.find('table', {'class': 'har'})
                    for tr in table.findAll('tr'):
                        td = tr.findAll('td', limit=2)
                        if td[0].string == u'Телефон':
                            phones_temp = td[1].div.string.encode('utf-8')
                            phones_temp = phones_temp.replace(' ', '').replace('-', '').replace('–', '').split(';')
                        elif td[0].string == u'Email адрес':
                            email = td[1].div.a.string.encode('utf-8')
                        elif td[0].string == u'Сайт':
                            site = td[1].div.a.string.encode('utf-8')

                    for phone in phones_temp:
                        phone = REG_PHONE.findall(phone)
                        if phone:
                            phone = phone[0].decode('utf-8')
                            phone_obj = phones_objs.get(phone)
                            if not phone_obj:
                                phone_obj = OrganizationPhones.objects.create(phone=phone)
                                phones_objs[phone] = phone_obj
                            phones.append(phone_obj)

                    # labels (categories, tags)
                    for cat in i.findAll('cat'):
                        category_name = cat['value']
                        category_obj = tags_objs.get(category_name)
                        if not category_obj:
                            category_obj = OrganizationTags.objects.create(name=category_name)
                            tags_objs[category_name] = category_obj
                        tags.append(category_obj)

                    org_obj = Organization.objects.create(
                        name=title,
                        site=site,
                        email=email,
                        note=description,
                        source_obj=source,
                        source_id=source_id,
                    )
                    for j in phones:
                        org_obj.phones.add(j)
                    for j in tags:
                        org_obj.tags.add(j)
                    for j in streets:
                        org_obj.buildings.add(j)
                    source_ids.append(source_id)

        if count % 10 == 0:
            time.sleep(random.uniform(1.0, 3.0))
    return HttpResponse(str('finish'))
def get_yovideo():
    source = ImportSources.objects.get(url='http://www.yo-video.net/')
    sfilm_clean(source)

    today = datetime.datetime.now()
    french_month = {
        '1': 'janvier', '2': 'fevrier', '3': 'mars', '4': 'avril',
        '5': 'mai', '6': 'juin', '7': 'juillet', '8': 'aout',
        '9': 'septembre', '10': 'octobre', '11': 'novembre', '12': 'decembre',
    }

    data_nof_film = ''
    noffilms = []
    ignored = get_ignored_films()

    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    main_urls = []
    for i in range(today.month, 13):
        m = french_month.get(str(i))
        url = '%sfr/sorties/cinema/%s/%s/' % (source.url, today.year, m)
        req = urllib.urlopen(url)
        if req.getcode() == 200:
            data = BeautifulSoup(req.read(), from_encoding="utf-8")
            for h2 in data.findAll('h2'):
                day = h2.findAll('span', limit=1)[0].string.encode('utf-8')
                time.sleep(1)
                req2 = urllib.urlopen('%s%s' % (url, day))
                if req2.getcode() == 200:
                    data2 = BeautifulSoup(req2.read(), from_encoding="utf-8")
                    release_date = datetime.date(today.year, int(i), int(day))
                    for film_block in data2.findAll('div', {'class': 'sfilm'}):
                        film_id = film_block.find('a').get('href').encode('utf-8')
                        full_url = '%s%s' % (source.url, film_id.lstrip('/'))
                        name = film_block.find('img').get('alt').encode('utf-8').replace('Film ', '')
                        slug = low(del_separator(name))
                        if slug.decode('utf-8') not in ignored and film_id not in noffilms:
                            obj = films.get(film_id)
                            next_step = checking_obj(obj)
                            if next_step:
                                kid = None
                                if obj:
                                    kid = obj.kid
                                # defaults, so films whose kid is already known keep valid values
                                alter_name = None
                                alter_name_slug = None
                                txt = None
                                if not kid:
                                    req3 = urllib.urlopen(full_url)
                                    if req3.getcode() == 200:
                                        data3 = BeautifulSoup(req3.read(), from_encoding="utf-8")
                                        h3 = data3.find('h3')
                                        if h3:
                                            alter_name = h3.string.encode('utf-8')
                                            alter_name_slug = low(del_separator(alter_name))
                                        kid, info = film_identification(slug, alter_name_slug, {}, {}, source=source)
                                        if not kid:
                                            div = data3.find('div', {'class': "filmLeft"})
                                            img_url = div.find('img').get('src').encode('utf-8')
                                            details = data3.find('div', {'class': "details"})
                                            director = details.find('span', itemprop="name")
                                            if director:
                                                director = director.string.encode('utf-8').strip()
                                            year = re.findall(ur'Année\s?\: \d+', details.text)
                                            if year:
                                                year = year[0].encode('utf-8').replace('Année', '').replace(':', '').strip()
                                            txt = '%s;%s;%s;%s' % (full_url.encode('utf-8'), img_url, director, year)
                                            kid = None
                                objt = None
                                if kid:
                                    create_new, objt = unique_func(fdict, kid, obj)
                                    if create_new:
                                        new = create_sfilm(film_id, kid, source, name, name_alt=alter_name, txt=txt, extra=release_date)
                                        films[film_id] = new
                                        if not fdict.get(kid):
                                            fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                        fdict[kid]['script_rel'].append(new)
                                else:
                                    if not obj:
                                        new = create_sfilm(film_id, kid, source, name, name_alt=alter_name, txt=txt, extra=release_date)
                                        films[film_id] = new
                                        if not fdict.get(kid):
                                            fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                        fdict[kid]['script_rel'].append(new)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'releases', 'Франц.релизы')
def get_kinohod_films():
    # print "BEGIN get_kinohod_films()"
    ignored = get_ignored_films()

    t1 = time.time()
    start_time = datetime.datetime.now().strftime('%H:%M:%S')

    cron_data_new = ''
    cron_data_nof = ''
    cron_count = 0
    noffilms = []

    source = ImportSources.objects.get(url='http://kinohod.ru/')
    sfilm_clean(source)

    kinohod_cities = get_source_data(source, 'city', 'list')

    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    data_nof_films = ''
    main_url = 'http://www.kinohod.ru/api/rest/partner/v1/movies?apikey=%s' % SERVER_API_KEY
    for city_id in kinohod_cities:
        try:
            url = '%s&city=%s' % (main_url, city_id)
            req = urllib.urlopen(url)
            if req.getcode() == 200:
                json_data = req.read()
                data = json.loads(json_data)
                for i in data:
                    cron_count += 1
                    film_id = str(i['id']).decode('utf-8')
                    year = int(i['productionYear']) if i['productionYear'] else None
                    name_ru = i['title'].encode('utf-8')
                    name_ru_slug = low(del_separator(del_screen_type(name_ru)))
                    full_url = '%smovie/%s/' % (source.url, film_id)
                    name_en = None
                    name_en_slug = None
                    if i['originalTitle']:
                        name_en = i['originalTitle'].encode('utf-8')
                        name_en_slug = low(del_separator(del_screen_type(name_en)))
                    if year and name_ru_slug.decode('utf-8') not in ignored and film_id not in noffilms:
                        obj = films.get(film_id)
                        next_step = checking_obj(obj)
                        if next_step:
                            try:
                                if obj:
                                    kid = obj.kid
                                else:
                                    kid, info = film_identification(name_ru_slug, name_en_slug, {}, {}, year=year, source=source)
                                objt = None
                                if kid:
                                    create_new, objt = unique_func(fdict, kid, obj)
                                    if create_new:
                                        new = create_sfilm(film_id, kid, source, name_ru, name_alt=name_en, year=year)
                                        films[film_id] = new
                                        if not fdict.get(kid):
                                            fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                        fdict[kid]['script_rel'].append(new)
                                        cron_data_new += '%s<br />' % name_ru
                                elif not obj:
                                    if not name_en:
                                        name_en = '*'
                                        name_en_slug = '*'
                                    data_nof_films += xml_noffilm(name_ru, name_ru_slug, name_en, name_en_slug, film_id.encode('utf-8'), info, full_url.encode('utf-8'), source.id)
                                    noffilms.append(film_id)
                                    cron_data_nof += '%s<br />' % name_ru
                            except db.backend.Database._mysql.OperationalError:
                                pass
        except IOError:
            open('%s/ddd.txt' % settings.API_DUMP_PATH, 'a').write(str(url) + '\n')

    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_films)

    # cron log
    end_time = datetime.datetime.now().strftime('%H:%M:%S')
    cron_data = '%s | %s - %s %s\n' % (datetime.datetime.now().date(), start_time, end_time, 'Импорт фильмов киноход')
    cron_data += '<br /><b>Обработано</b>: %s' % cron_count
    cron_data += '<br /><b>Новые</b>: <br />%s' % cron_data_new
    cron_data += '<br /><b>Ненайденные</b>: <br />%s' % cron_data_nof
    for i in range(50):
        cron_data += '- '
    process_time = time.time() - t1
    cron_data = '<br />* %s сек.<br />%s' % (process_time, cron_data)
    open('%s/cron_log_kinohod_films.txt' % settings.CRON_LOG_PATH, 'a').write(cron_data)
    cron_success('json', source.dump, 'films', 'Фильмы')
def get_kinohod_cities():
    # print "BEGIN get_kinohod_cities()"
    t1 = time.time()
    start_time = datetime.datetime.now().strftime('%H:%M:%S')

    cron_data_new = ''
    cron_data_nof = ''
    cron_count = 0

    url = 'http://www.kinohod.ru/api/rest/partner/v1/cities?apikey=%s' % SERVER_API_KEY
    source = ImportSources.objects.get(url='http://kinohod.ru/')
    req = urllib.urlopen(url)
    if req.getcode() == 200:
        kinohod_cities = get_source_data(source, 'city', 'list')
        data_nof_city = ''
        json_data = req.read()
        data = json.loads(json_data)
        for i in data:
            cron_count += 1
            id = str(i['id']).decode('utf-8')
            if id not in kinohod_cities:
                alias = i['alias']
                name = i['name'].encode('utf-8')
                name_slug = del_screen_type(low(del_separator(name)))
                city = City.objects.filter(name__name=name_slug, name__status=2).distinct('pk')
                if city.count() == 1:
                    SourceCities.objects.create(
                        source_id=id,
                        source_obj=source,
                        city=city[0],
                        name=name,
                        name_alter=alias,
                    )
                    cron_data_new += '%s<br />' % name
                else:
                    data_nof_city += '<city name="%s" slug="%s"></city>' % (name, name_slug)
                    cron_data_nof += '%s<br />' % name
                kinohod_cities.append(id)
        create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_city)

    # cron log
    end_time = datetime.datetime.now().strftime('%H:%M:%S')
    cron_data = '%s | %s - %s %s<br />' % (datetime.datetime.now().date(), start_time, end_time, 'Импорт городов киноход')
    cron_data += '<br /><b>Обработано</b>: %s' % cron_count
    cron_data += '<br /><b>Новые</b>: <br />%s' % cron_data_new
    cron_data += '<br /><b>Ненайденные</b>: <br />%s' % cron_data_nof
    for i in range(50):
        cron_data += '- '
    process_time = time.time() - t1
    cron_data = '<br />* %s сек.<br />%s' % (process_time, cron_data)
    open('%s/cron_log_kinohod_cities.txt' % settings.CRON_LOG_PATH, 'a').write(cron_data)
    cron_success('json', source.dump, 'cities', 'Города')
def search(request):
    query_orig = request.GET.get('query', '')
    category = request.GET.get('filter', '')
    objs = []
    msg = ''
    element = ''
    data = {}
    count = 0
    if query_orig:
        query = low(query_orig)
        slug = del_separator(query)
        if len(query) > 2:
            # films
            if category == '1':
                data = []
                tmp_query = query.decode('utf-8').encode('ascii', 'xmlcharrefreplace')
                element = 'фильмов'
                try:
                    '''
                    Fetch the film IDs matching the search query.
                    upd: a check via set intersection showed this variant is
                    identical to the previous one (concatenating two sets).
                    upd: dropped the slug search, because the slug is built by
                    gluing all the title's words together, which produced
                    irrelevant matches (see the film "San tau jin zi lei saam").
                    '''
                    ids = set(list(FilmsName.objects.using('afisha').filter(
                        Q(name__icontains=query), status=1).values_list('film_id', flat=True)))
                    objs = FilmsName \
                        .objects \
                        .using('afisha') \
                        .only(
                            'name',
                            'film_id',
                            'film_id__description',
                            'film_id__year',
                            'film_id__genre1',
                            'film_id__genre2',
                            'film_id__imdb',
                        ) \
                        .select_related('film_id') \
                        .filter(film_id__id__in=ids, status=1, type=2) \
                        .order_by('-film_id__year', 'name')
                    tmp = None
                    tmp_first = []
                    relevant = []
                    nonrelevant = []
                    for i in objs:
                        # fetch the poster
                        temp_poster = Objxres.objects.using('afisha').select_related('extresid').filter(
                            objtypeid__in=[301, 300], objpkvalue=i.film_id_id)
                        posters = {}
                        for p in temp_poster:
                            if posters.get(p.objpkvalue):
                                if p.objtypeid == 301:
                                    posters[p.objpkvalue]['poster'].append(p)
                                else:
                                    posters[p.objpkvalue]['slides'].append(p)
                            else:
                                posters[p.objpkvalue] = {'poster': [], 'slides': []}
                                if p.objtypeid == 301:
                                    posters[p.objpkvalue]['poster'].append(p)
                                else:
                                    posters[p.objpkvalue]['slides'].append(p)
                        if posters and posters[i.film_id_id]['poster']:
                            i.poster = film_poster2(posters[i.film_id_id]['poster'], 'big')
                        # fetch the metadata
                        meta = Film.objects.using('afisha').get(id=i.film_id_id)
                        # resolve genre names from the metadata
                        i.genres = AfishaGenre.objects.using('afisha').filter(
                            pk__in=[meta.genre1_id, meta.genre2_id, meta.genre3_id])
                        # IMDb rating from the metadata
                        # (dirty hack: the django template engine cannot do numeric loops with conditions)
                        if meta.imdb:
                            i.imdb = {}
                            imdb_val = meta.imdb
                            i.imdb['range'] = range(0, int(math.ceil(float(imdb_val.replace(',', '.')))), 1)
                            i.imdb['value'] = meta.imdb
                        # weighted rating
                        int_rate, show_ir, show_imdb, rotten = check_int_rates(i.film_id_id)
                        i.rating = {'rate': int_rate, 'show_ir': show_ir, 'show_imdb': show_imdb, 'rotten': rotten}
                        # upd: reworked the relevance sort; a film is "relevant"
                        # when the search phrase occurs in its description
                        tmp = i.film_id_id
                        if tmp not in tmp_first:
                            if meta.description is not None and meta.description.find(tmp_query) != -1:
                                relevant.append(i)
                            else:
                                nonrelevant.append(i)
                            tmp_first.append(tmp)
                    data = relevant + nonrelevant
                    count = len(tmp_first)
                    if count == 1:
                        return HttpResponseRedirect(reverse('get_film', kwargs={'id': tmp}))
                except db.backend.Database._mysql.OperationalError:
                    pass
            # persons
            elif category == '2':
                data = {'first': [], 'middle': [], 'last': []}
                element = 'персон'
                tmp_query = query.decode('utf-8').encode('ascii', 'xmlcharrefreplace')
                ids1 = set(list(NamePerson.objects.exclude(person__kid=None).filter(
                    Q(name__iexact=tmp_query) | Q(name__istartswith=slug),
                    status__in=(1, 2)).values_list('person__id', flat=True)))
                ids2 = set(list(NamePerson.objects.exclude(person__id__in=ids1, person__kid=None).filter(
                    name__icontains=slug, status=2).values_list('person__id', flat=True)))
                ids = set(list(ids1) + list(ids2))
                objs1 = list(NamePerson.objects.filter(
                    person__id__in=ids, status=1, language__id=1).order_by('name').values('person__id', 'name', 'person__kid'))
                result_ids = set([i['person__id'] for i in objs1])
                objs2 = list(NamePerson.objects.exclude(person__id__in=result_ids).filter(
                    person__id__in=ids, status=1, language__id=2).order_by('name').values('person__id', 'name', 'person__kid'))
                tmp = None
                tmp_first = []
                for objs in [objs1, objs2]:
                    for i in objs:
                        tmp = i['person__kid']
                        if long(i['person__id']) in ids1:
                            if low(i['name'].encode('utf-8')) == tmp_query:
                                if tmp not in tmp_first:
                                    data['first'].append((i['person__kid'], i['name']))
                                    tmp_first.append(tmp)
                            else:
                                if tmp not in tmp_first:
                                    data['middle'].append((i['person__kid'], i['name']))
                                    tmp_first.append(tmp)
                        else:
                            if tmp not in tmp_first:
                                data['last'].append((i['person__kid'], i['name']))
                                tmp_first.append(tmp)
                count = len(tmp_first)
                if count == 1:
                    return HttpResponseRedirect(reverse('get_person', kwargs={'id': tmp}))
            # music
            elif category == '3':
                if request.subdomain == 'music':
                    data = {'first': [], 'middle': [], 'last': [], 'artist': []}
                    element = ''
                    tmp_query = query.decode('utf-8').encode('ascii', 'xmlcharrefreplace')
                    ids1 = set(list(Composition.objects.filter(
                        Q(name__name__iexact=tmp_query) | Q(name__name__istartswith=slug),
                        name__status__in=(2, 5)).values_list('id', flat=True)))
                    ids2 = set(list(Composition.objects.exclude(pk__in=ids1).filter(
                        name__name__icontains=slug, name__status__in=(2, 5)).values_list('id', flat=True)))
                    ids = set(list(ids1) + list(ids2))
                    comp_rels = {}
                    for i in CompositionPersonRel.objects.filter(
                            composition__pk__in=ids, type__name='исполнение').values('person', 'person__name__name', 'composition'):
                        comp_rels[i['composition']] = {'pid': i['person'], 'pname': i['person__name__name']}
                    objs1 = list(CompositionName.objects.filter(
                        composition__pk__in=ids, status=2).order_by('name').values('composition__id', 'name'))
                    tmp = None
                    tmp_first = []
                    for i in objs1:
                        tmp = i['composition__id']
                        if long(i['composition__id']) in ids1:
                            if low(i['name'].encode('utf-8')) == query:
                                if tmp not in tmp_first:
                                    artist = comp_rels.get(i['composition__id'])
                                    data['first'].append((i['composition__id'], i['name'], artist))
                                    tmp_first.append(tmp)
                            else:
                                if tmp not in tmp_first:
                                    artist = comp_rels.get(i['composition__id'])
                                    data['middle'].append((i['composition__id'], i['name'], artist))
                                    tmp_first.append(tmp)
                        else:
                            if tmp not in tmp_first:
                                artist = comp_rels.get(i['composition__id'])
                                data['last'].append((i['composition__id'], i['name'], artist))
                                tmp_first.append(tmp)
                    count = len(tmp_first)
                    artists = Person.objects.filter(
                        Q(name__name__iexact=tmp_query) | Q(name__name__istartswith=slug) | Q(name__name__icontains=slug),
                        artist=True, name__status=4).values('id', 'name__name')
                    data['artist'] = artists
                    count += artists.count()
            # cinemas
            else:
                element = 'кинотеатров'
                tmp_query = query.decode('utf-8').encode('ascii', 'xmlcharrefreplace')
                ids1 = set(list(Organization.objects.exclude(kid=None).filter(
                    Q(name__iexact=tmp_query) | Q(name__istartswith=slug)).values_list('id', flat=True)))
                ids2 = set(list(Organization.objects.exclude(Q(id__in=ids1) | Q(kid=None)).filter(
                    name__icontains=slug).values_list('id', flat=True)))
                ids = set(list(ids1) + list(ids2))
                buildings = list(Building.objects.filter(
                    organization__id__in=ids, city__name__status=1).values(
                    'city', 'city__name__name', 'city__country', 'city__country__name',
                    'city__kid', 'organization', 'organization__name', 'organization__uni_slug'))
                cities = {}
                count = 0
                tmp = None
                for i in buildings:
                    count += 1
                    tmp = i['organization__uni_slug']
                    if not cities.get(i['city']):
                        cities[i['city']] = {
                            'id': i['city'],
                            'name': i['city__name__name'],
                            'country': i['city__country__name'],
                            'cinemas': [],
                        }
                    cities[i['city']]['cinemas'].append({'name': i['organization__name'], 'slug': i['organization__uni_slug']})
                cities = sorted(cities.values(), key=operator.itemgetter('name'))
                data = {}
                for i in cities:
                    if not data.get(i['country']):
                        data[i['country']] = []
                    data[i['country']].append(i)
                if count == 1:
                    return HttpResponseRedirect(reverse('organization_cinema', kwargs={'id': tmp}))
        else:
            msg = 'Слишком короткий запрос'
    return render_to_response('search_result.html', {
        'objs': data,
        'element': element,
        'msg': msg,
        'srch_category': category,
        'query': query_orig,
        'count': count,
    }, context_instance=RequestContext(request))
def parse_data_ident(request, selected):
    """Identify the fetched records."""
    #try:
    debug_logs("start ident %s " % selected)

    # start timing the function
    start = time.time()

    data_nof_film = ''
    noffilms = []
    ignored = get_ignored_films()

    # identification type, passed as a parameter to the identification function
    ident_type = 'movie_online'

    # select all films flagged afisha_id=None/0 from the database;
    # all films are flagged like this during parsing, i.e. these are
    # films that have never been through kinoafisha identification
    data = MovieMegogo.objects.filter(afisha_id__in=(0, None))

    # fetch the parameters needed for identification,
    # iterating over each individual film
    for i in data:
        year = i.year
        name_ru = i.title
        name_en = i.title_en
        country = i.country

        # clean the ru/en titles for film identification
        name_ru_slug = del_separator(low(name_ru))
        name_en_slug = del_separator(low(name_en))

        # set the year range for film identification
        new_year = year + 2
        old_year = year - 2
        filter_year = {'year__gte': old_year, 'year__lte': new_year}

        try:
            # pass the film to the identification function
            kid, info = film_identification(name_ru_slug, name_en_slug, {}, {}, filter_year, ident_type, country)
            if kid:
                # store the result on the model
                i.afisha_id = kid
                i.save()
            else:
                if i.megogo_id not in noffilms and name_ru_slug.decode('utf-8') not in ignored:
                    data_nof_film += xml_noffilm(name_ru.encode('utf-8'), name_ru_slug, None, None, i.megogo_id, info, i.page.encode('utf-8'))
                    noffilms.append(i.megogo_id)
        except db.backend.Database._mysql.OperationalError:
            if i.megogo_id not in noffilms and name_ru_slug.decode('utf-8') not in ignored:
                data_nof_film += xml_noffilm(name_ru.encode('utf-8'), name_ru_slug, None, None, i.megogo_id, None, i.page.encode('utf-8'))
                noffilms.append(i.megogo_id)

    # function execution time
    finish = time.time()
    timer = "%.2f мин" % ((float(finish - start)) / 60)
    debug_logs("finish")
    debug_logs("timer: %s " % timer)
    debug_logs("Идентификация: название %s / инфо %s %s" % (name_ru_slug, kid, info))

    source = ImportSources.objects.get(url='http://megogo.net/')
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)

    # return to the interface
    return simplejson.dumps({
        'request_type': 1,
        'timer': timer,
    })
def get_megamag():
    ''' Fetch film urls '''
    import cookielib

    def give_me_cookie():
        cookie = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie), urllib2.HTTPHandler())
        return opener

    ignored = get_ignored_films()
    ignored_cinemas = get_ignored_cinemas()

    source = ImportSources.objects.get(url='http://megamag.by/')
    sfilm_clean(source)

    megamag_cities_dict = get_source_data(source, 'city', 'dict')
    megamag_cinemas_dict = get_source_data(source, 'cinema', 'dict')

    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    cities_data = {}
    data_nof_films = ''
    data_nof_cinema = ''
    data_nof_city = ''
    noffilms = []
    schedules_data = []

    opener = give_me_cookie()
    req = opener.open(urllib2.Request('http://kinoteatr.megamag.by/index.php'))

    event_dict = {}
    if req.getcode() == 200:
        data = BeautifulSoup(req.read(), from_encoding="utf-8")
        cities = data.find('div', id="box-region")
        for i in cities.findAll('a'):
            city_name = i.text.encode('utf-8')
            city_slug = low(del_separator(city_name))
            city_id = i.get('href').replace('http://kinoteatr.megamag.by/index.php?region_id=', '')
            mcity = megamag_cities_dict.get(city_id)
            if not mcity:
                city = City.objects.filter(name__name=city_slug, name__status=2).distinct('pk')
                if city.count() == 1:
                    mcity = SourceCities.objects.create(
                        source_id=city_id,
                        source_obj=source,
                        city=city[0],
                        name=city_name,
                    )
                else:
                    if 'slug="%s"' % city_slug not in data_nof_city:
                        data_nof_city += '<city name="%s" slug="%s"></city>' % (city_name, city_slug)
            if mcity:
                cities_data[city_name] = mcity

        try:
            cinemas_tag = data.findAll('td', {'class': 'Cinema_new_box_1_BoxText'}, limit=1)[0]
        except IndexError:
            cinemas_tag = None
        if cinemas_tag:
            for i in cinemas_tag.findAll('a'):
                cinema_url = i.get('href')
                cinema_id = cinema_url.replace('http://kinoteatr.megamag.by/index.php?cPath=', '')
                cinema_obj = megamag_cinemas_dict.get(cinema_id)
                opener = give_me_cookie()
                try:
                    req2 = opener.open(urllib2.Request(cinema_url))
                    if req2.getcode() == 200:
                        schedules_page = BeautifulSoup(req2.read(), from_encoding="utf-8")
                        city_name = schedules_page.findAll('div', {'class': 'object_param_value'}, limit=1)[0].text.encode('utf-8')
                        city_obj = cities_data.get(city_name)
                        if city_obj:
                            cinema_name = schedules_page.find('div', {'class': 'object_title'}).text.encode('utf-8')
                            cinema_name = cinema_name.replace('"', '').replace('Кинотеатр', '')
                            cinema_slug = low(del_separator(cinema_name))
                            cinema_ig_id = u'%s__%s' % (cinema_slug.decode('utf-8'), city_obj.city.kid)
                            if cinema_ig_id not in ignored_cinemas:
                                if not cinema_obj:
                                    filter1 = {'name__name': cinema_slug, 'name__status': 2, 'city': city_obj.city}
                                    cinema_kid = cinema_identification(cinema_slug, filter1)
                                    if cinema_kid:
                                        try:
                                            cinema = Cinema.objects.get(code=cinema_kid)
                                            cinema_obj = SourceCinemas.objects.create(
                                                source_id=cinema_id,
                                                source_obj=source,
                                                city=city_obj,
                                                cinema=cinema,
                                                name=cinema_name,
                                            )
                                        except Cinema.DoesNotExist:
                                            pass
                                else:
                                    cinema_kid = cinema_obj.cinema.code
                                if cinema_kid:
                                    for event in schedules_page.findAll('td', {'class': 'eventsHeading'}):
                                        if event.a.get('name'):
                                            ev = event.a['name'].split('_')[1]
                                            fname = event.a.text.encode('utf-8')
                                            fid = event.a.get('href').replace('http://kinoteatr.megamag.by/newsdesk_info.php?newsdesk_id=', '')
                                            event_dict[int(ev)] = {'name': fname, 'id': int(fid)}
                                    links = []
                                    for td in schedules_page.findAll('td', {'class': 'main'}):
                                        for link in td.findAll('a'):
                                            l = link.get('href')
                                            if l and 'cPath' in l:
                                                links.append(l)
                                    schedules_data.append({
                                        'mcity': city_obj,
                                        'city': city_obj.city,
                                        'mcinema': cinema_obj,
                                        'cinema': cinema_kid,
                                        'schedules': set(links),
                                    })
                                else:
                                    if 'slug="%s"' % cinema_slug not in data_nof_cinema:
                                        data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (
                                            cinema_name, cinema_slug, city_name, city_obj.city.kid)
                except httplib.HTTPException:
                    pass

    create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_city)
    create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_cinema)

    megamag = get_source_data(source, 'schedule', 'list')
    for obj in schedules_data:
        cinema_object = obj['mcinema']
        for index, i in enumerate(obj['schedules']):
            opener = give_me_cookie()
            try:
                req3 = opener.open(urllib2.Request(i))
                if req3.getcode() == 200:
                    id_schedule = i.replace('http://kinoteatr.megamag.by/index.php?cPath=', '').encode('utf-8')
                    if id_schedule not in megamag:
                        sch_page = BeautifulSoup(req3.read(), from_encoding="utf-8")
                        tables = sch_page.findAll('table', {'class': 'Cinema_new_box_2_TemplateCenterPart'}, limit=1)[0]
                        main_table = tables.findAll('table', cellpadding='4', limit=1)[0]
                        tr = main_table.findAll('tr')[1]
                        td = tr.findAll('strong')
                        event_id = id_schedule.split('_')[2]
                        film_data = event_dict.get(int(event_id))
                        if film_data:
                            film_name = film_data['name']
                            film_name_slug = low(del_separator(del_screen_type(film_name)))
                            film_id = film_data['id']
                            if film_id not in noffilms and film_name_slug.decode('utf-8') not in ignored:
                                obj = films.get(str(film_id).decode('utf-8'))
                                next_step = checking_obj(obj)
                                if next_step:
                                    if obj:
                                        kid = obj.kid
                                    else:
                                        kid, info = film_identification(film_name_slug, None, {}, {}, source=source)
                                    objt = None
                                    if kid:
                                        create_new, objt = unique_func(fdict, kid, obj)
                                        if create_new:
                                            objt = create_sfilm(film_id, kid, source, film_name)
                                            films[str(film_id).decode('utf-8')] = objt
                                            if not fdict.get(kid):
                                                fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                            fdict[kid]['script_rel'].append(objt)
                                    elif not obj:
                                        data_nof_films += xml_noffilm(film_name, film_name_slug, None, None, film_id, info, None, source.id)
                                        noffilms.append(film_id)
                                    if objt:
                                        dtime_info = td[1].text.encode('utf-8').split()
                                        year_info = datetime.datetime.now().year
                                        day_info = int(dtime_info[0])
                                        month_low = low(dtime_info[1].replace(',', ''))
                                        month_info = int(get_month(month_low))
                                        time_info = dtime_info[-1].replace('(', '').replace(')', '').split(':')
                                        dtime = datetime.datetime(year_info, month_info, day_info, int(time_info[0]), int(time_info[1]), 0)
                                        SourceSchedules.objects.create(
                                            source_id=id_schedule,
                                            source_obj=source,
                                            cinema=cinema_object,
                                            film=objt,
                                            dtime=dtime,
                                        )
            except httplib.HTTPException:
                open('%s/httplib_errors.txt' % settings.API_DUMP_PATH, 'a').write('%s\n' % i)
            # pause for 2 seconds on every 60th request to the source
            if (index + 1) % 60 == 0:
                time.sleep(2.0)

    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_films)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
def get_premierzal_schedules():
    data_nof_film = ''
    noffilms = []
    ignored = get_ignored_films()

    source = ImportSources.objects.get(url='http://www.premierzal.ru/')
    sfilm_clean(source)

    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    schedules = get_source_data(source, 'schedule', 'list')

    # group source cinemas by their city
    cities_cinemas = {}
    for i in SourceCinemas.objects.select_related('city').filter(source_obj=source):
        if not cities_cinemas.get(i.city.source_id):
            cities_cinemas[i.city.source_id] = {'city': i.city, 'cinemas': []}
        cities_cinemas[i.city.source_id]['cinemas'].append(i)

    for k, v in cities_cinemas.iteritems():
        city_url_encode = urllib.quote(v['city'].name.encode('utf-8'))
        for i in v['cinemas']:
            main_url = '%s?theatre=%s&city=%s' % (source.url, i.source_id, city_url_encode)
            main_req = urllib.urlopen(main_url)
            if main_req.getcode() == 200:
                data = BeautifulSoup(main_req.read())
                data = data.find('div', id="films-list")
                if data:
                    # collect the dates offered by the calendar widget
                    dates = []
                    for calendar in data.findAll('table', {'class': 'calendar'}):
                        for a in calendar.findAll('a'):
                            href = a.get('href', '')
                            href_dict = dict(cgi.parse_qsl(href))
                            # the first pair sometimes keeps the leading '?'
                            calendar_date = href_dict.get(u'?date', href_dict.get(u'date'))
                            if calendar_date:
                                dates.append({'date': calendar_date, 'href': href})

                    for ind, d in enumerate(dates):
                        films_blocks = []
                        if ind == 0:
                            # the first date is already on the loaded page
                            films_blocks = data.findAll('div', {'class': 'film-item-wrapper'})
                        else:
                            url = '%s?date=%s&city=%s&theatre=%s' % (
                                source.url, d['date'], city_url_encode, i.source_id)
                            req = urllib.urlopen(url)
                            if req.getcode() == 200:
                                data = BeautifulSoup(req.read())
                                data = data.find('div', id="films-list")
                                if data:  # guard against a missing block
                                    films_blocks = data.findAll(
                                        'div', {'class': 'film-item-wrapper'})
                            time.sleep(random.uniform(0.8, 2.2))

                        for block in films_blocks:
                            title = block.find('div', {'class': 'title'}).find('a')
                            film_name = title.text.encode('utf-8').strip()
                            film_slug = low(del_separator(del_screen_type(film_name)))
                            film_id = film_slug

                            if film_id not in noffilms and film_slug.decode('utf-8') not in ignored:
                                obj = films.get(film_id.decode('utf-8'))
                                next_step = checking_obj(obj)
                                if next_step:
                                    if obj:
                                        kid = obj.kid
                                    else:
                                        kid, info = film_identification(
                                            film_slug, None, {}, {}, source=source)

                                    objt = None
                                    if kid:
                                        create_new, objt = unique_func(fdict, kid, obj)
                                        if create_new:
                                            objt = create_sfilm(film_id, kid, source, film_name)
                                            films[film_id.decode('utf-8')] = objt
                                            if not fdict.get(kid):
                                                fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                            fdict[kid]['script_rel'].append(objt)
                                    elif not obj:
                                        data_nof_film += xml_noffilm(
                                            film_name, film_slug, None, None,
                                            film_id, info, None, source.id)
                                        noffilms.append(film_id)

                                    if objt:
                                        year, month, day = d['date'].split(u'-')
                                        for tm in block.findAll('div', {'class': 'seanse-item'}):
                                            for t in tm.text.encode('utf-8').split('|'):
                                                t = re.findall(r'\d{2}\:\d{2}', t)
                                                if t:
                                                    hours, minutes = t[0].strip().split(':')
                                                    dtime = datetime.datetime(
                                                        int(year), int(month), int(day),
                                                        int(hours), int(minutes))
                                                    sch_id = '%s%s%s' % (
                                                        dtime,
                                                        i.source_id.encode('utf-8'),
                                                        film_id)
                                                    sch_id = sch_id.replace(' ', '').decode('utf-8')
                                                    if sch_id not in schedules:
                                                        SourceSchedules.objects.create(
                                                            source_id=sch_id,
                                                            source_obj=source,
                                                            film=objt,
                                                            cinema=i,
                                                            dtime=dtime,
                                                        )
                                                        schedules.append(sch_id)
            time.sleep(random.uniform(1.1, 1.8))

    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
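# The calendar links on premierzal.ru carry the date as a query parameter,
# sometimes with the leading '?' fused into the key, which is why
# get_premierzal_schedules() checks both '?date' and 'date'. A minimal
# illustration (the href value is invented):
import cgi

href = '?date=2013-06-01&theatre=17'
href_dict = dict(cgi.parse_qsl(href))          # {'?date': '2013-06-01', 'theatre': '17'}
calendar_date = href_dict.get('?date', href_dict.get('date'))
# -> '2013-06-01'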
def get_vkinocomua_films_and_schedules():
    ignored = get_ignored_films()
    data_nof_film = ''
    noffilms = []

    source = ImportSources.objects.get(url='http://vkino.com.ua/')
    sfilm_clean(source)

    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    schedules = get_source_data(source, 'schedule', 'list')

    cinemas_data = SourceCinemas.objects.select_related('city').filter(source_obj=source)

    for ind, i in enumerate(cinemas_data):
        url = '%scinema/%s/%s/showtimes' % (source.url, i.city.source_id, i.source_id)
        req = urllib.urlopen(url)
        if req.getcode() == 200:
            data = BeautifulSoup(req.read(), from_encoding="utf-8")
            main = data.find('div', id='cinema-showtimes')
            if main:
                for content in main.findAll('div', {'class': 'content'}):
                    film_tag = content.find('a', {'class': 'navi'})
                    film_name = film_tag.string.encode('utf-8').strip()
                    film_slug = low(del_separator(film_name))
                    full_url = film_tag.get('href').encode('utf-8')

                    # prefer the numeric id from the href, fall back to the slug
                    film_id = re.findall(r'\/\d+\/', full_url)
                    if film_id:
                        film_id = film_id[0].replace('/', '')
                    else:
                        film_id = film_slug
                    full_url = '%s%s' % (source.url, full_url.lstrip('/'))

                    if film_id not in noffilms and film_slug.decode('utf-8') not in ignored:
                        obj = films.get(film_id.decode('utf-8'))
                        next_step = checking_obj(obj)
                        if next_step:
                            if obj:
                                kid = obj.kid
                            else:
                                kid, info = film_identification(
                                    film_slug, None, {}, {}, source=source)

                            objt = None
                            if kid:
                                create_new, objt = unique_func(fdict, kid, obj)
                                if create_new:
                                    objt = create_sfilm(film_id, kid, source, film_name)
                                    # key by the decoded id, consistent with the lookup above
                                    films[film_id.decode('utf-8')] = objt
                                    if not fdict.get(kid):
                                        fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                    fdict[kid]['script_rel'].append(objt)
                            elif not obj:
                                data_nof_film += xml_noffilm(
                                    film_name, film_slug, None, None,
                                    film_id, info, url, source.id)
                                noffilms.append(film_id)

                            if objt:
                                for div in content.findAll('div', {'class': 'date'}):
                                    year, month, day = div['data-date'].split('-')
                                    show = div.find_next_sibling("ul")
                                    for li in show.findAll('li'):
                                        if li.a:
                                            # a link means tickets are on sale
                                            extra = li.a.get('href')
                                            hours, minutes = li.a.text.strip().split(':')
                                        else:
                                            extra = None
                                            hours, minutes = li.text.strip().split(':')
                                        dtime = datetime.datetime(
                                            int(year), int(month), int(day),
                                            int(hours), int(minutes))
                                        sch_id = u'%s%s%s%s' % (
                                            dtime, i.source_id, i.city_id,
                                            film_id.decode('utf-8'))
                                        sch_id = sch_id.replace(' ', '')
                                        if sch_id not in schedules:
                                            SourceSchedules.objects.create(
                                                source_id=sch_id,
                                                source_obj=source,
                                                film=objt,
                                                cinema=i,
                                                dtime=dtime,
                                                extra=extra,
                                            )
                                            schedules.append(sch_id)
        if ind % 4 == 0:
            time.sleep(random.uniform(1.0, 3.0))

    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
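# vkino.com.ua film links embed a numeric id ('/film/12345/...'); the importer
# above prefers that id and falls back to the name slug when the pattern is
# absent. The same logic, isolated (the example href is invented):
import re

def extract_film_id(href, fallback_slug):
    match = re.findall(r'/\d+/', href)
    if match:
        return match[0].replace('/', '')
    return fallback_slug

# extract_film_id('/film/12345/gravity', 'gravity') -> '12345'
# extract_film_id('/film/gravity', 'gravity')       -> 'gravity'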
def get_premierzal_cinemas():
    source = ImportSources.objects.get(url='http://www.premierzal.ru/')

    cinemas = get_source_data(source, 'cinema', 'list')
    cities_dict = get_source_data(source, 'city', 'dict')

    cinemas_dict = {}
    for i in Cinema.objects.all():
        cinemas_dict[i.code] = i

    ignored_cinemas = get_ignored_cinemas()

    data_nof_cinema = ''

    # any known city will do: the theatres page also lists
    # cinemas of all other cities in a separate block
    city = cities_dict.values()[0]

    body = urllib.urlencode({'city': city.name.encode('utf-8')})
    url = '%stheatres?%s' % (source.url, body)
    req = urllib.urlopen(url)
    if req.getcode() == 200:
        data = BeautifulSoup(req.read())

        blocks = []
        block1 = data.find('div', {'class': 'this_city_theatres'})
        block2 = data.find('div', {'class': 'other_city_theatres'})
        if block1:
            blocks.append(block1)
        if block2:
            blocks.append(block2)

        for ind, block in enumerate(blocks):
            for a in block.findAll('a'):
                cinema_name = a.text.encode('utf-8').strip().replace('"', '')
                cinema_id = a.get('href').replace('/theatres/', '').replace('/', '')

                if ind == 0:
                    city_obj = city
                else:
                    # other cities' entries look like "Город, Кинотеатр"
                    city_name, cinema_name = cinema_name.split(',')
                    cinema_name = cinema_name.strip()
                    city_slug = low(del_separator(city_name.strip()))
                    city_obj = cities_dict.get(city_slug.decode('utf-8'))

                cinema_slug = low(del_separator(cinema_name))

                if city_obj:
                    cinema_ig_id = u'%s__%s' % (
                        cinema_slug.decode('utf-8'), city_obj.city.kid)
                    if cinema_id.decode('utf-8') not in cinemas and cinema_ig_id not in ignored_cinemas:
                        filter1 = {
                            'name__name': cinema_slug,
                            'name__status': 2,
                            'city__id': city_obj.city_id,
                        }
                        cinema = cinema_identification(cinema_slug, filter1)
                        cin_obj = cinemas_dict.get(cinema)
                        if cin_obj:
                            SourceCinemas.objects.create(
                                source_id=cinema_id,
                                source_obj=source,
                                city=city_obj,
                                cinema=cin_obj,
                                name=cinema_name,
                            )
                            cinemas.append(cinema_id.decode('utf-8'))
                        else:
                            data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (
                                cinema_name, cinema_slug,
                                city_obj.name.encode('utf-8'), city_obj.city.kid)

    create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_cinema)
    cron_success('html', source.dump, 'cinemas', 'Кинотеатры')
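# The theatres page is requested with the city name percent-encoded into the
# query string, as get_premierzal_cinemas() does above. A one-line sketch
# (the city name is just an example):
import urllib

body = urllib.urlencode({'city': u'Абакан'.encode('utf-8')})
# -> 'city=%D0%90%D0%B1%D0%B0%D0%BA%D0%B0%D0%BD'
url = 'http://www.premierzal.ru/theatres?%s' % body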
def get_vkinocomua_cities_and_cinemas():
    nofcities = []
    nofcinemas = []
    data_nof_cinema = ''
    data_nof_city = ''

    source = ImportSources.objects.get(url='http://vkino.com.ua/')

    cinemas_dict = get_source_data(source, 'cinema', 'dict')
    cities_dict = get_source_data(source, 'city', 'dict')

    req = urllib.urlopen('%safisha/kiev' % source.url)
    if req.getcode() == 200:
        data = BeautifulSoup(req.read(), from_encoding="utf-8")

        cities_tag = data.find('select', id='city-selector')
        for ind, i in enumerate(cities_tag.findAll('option')):
            if i['value']:
                city_name = i.string.encode('utf-8')
                city_slug = low(del_separator(city_name))
                city_id = i['value'].encode('utf-8')

                city_obj = cities_dict.get(city_id)
                if not city_obj and city_id not in nofcities:
                    city = City.objects.filter(
                        name__name=city_slug, name__status=2).distinct('pk')
                    if city.count() == 1:
                        city_obj = SourceCities.objects.create(
                            source_id=city_id,
                            source_obj=source,
                            city=city[0],
                            name=city_name,
                        )
                        cities_dict[city_id] = city_obj
                    else:
                        data_nof_city += '<city name="%s" slug="%s"></city>' % (
                            city_name, city_slug)
                        nofcities.append(city_id)

                if city_obj:
                    url = '%scinema/%s' % (source.url, city_id)
                    req_cinema = urllib.urlopen(url)
                    if req_cinema.getcode() == 200:
                        data_cinema = BeautifulSoup(req_cinema.read(), from_encoding="utf-8")
                        for tag in data_cinema.findAll('a', {'class': 'cinema'}):
                            cinema_name = tag.string.encode('utf-8')
                            cinema_slug = low(del_separator(cinema_name))
                            cinema_id = tag.get('href').replace(
                                '/cinema/%s/' % city_id, '').encode('utf-8')

                            cinema_obj = cinemas_dict.get(cinema_id)
                            if not cinema_obj and cinema_id not in nofcinemas:
                                filter1 = {
                                    'name__name': cinema_slug,
                                    'name__status': 2,
                                    'city': city_obj.city,
                                }
                                cinema_kid = cinema_identification(cinema_slug, filter1)
                                if cinema_kid:
                                    try:
                                        cin_obj = Cinema.objects.get(code=cinema_kid)
                                        cinema_obj = SourceCinemas.objects.create(
                                            source_id=cinema_id,
                                            source_obj=source,
                                            city=city_obj,
                                            cinema=cin_obj,
                                            name=cinema_name,
                                        )
                                        cinemas_dict[cinema_id] = cinema_obj
                                    except Cinema.DoesNotExist:
                                        pass
                                else:
                                    nofcinemas.append(cinema_id)
                                    data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (
                                        cinema_name, cinema_slug,
                                        city_name, city_obj.city.kid)

            if ind % 4 == 0:
                time.sleep(random.uniform(1.0, 3.0))

    create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_city)
    create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_cinema)
    cron_success('html', source.dump, 'cities_and_cinemas', 'Города и кинотеатры')
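# Both vkino.com.ua importers sleep for a random 1-3 seconds on every fourth
# request so the site is not hammered. The pattern, isolated as a helper
# (the helper name and defaults are illustrative, not part of the importer):
import time
import random

def polite_pause(index, every=4, low_s=1.0, high_s=3.0):
    # sleep only on every `every`-th iteration
    if index % every == 0:
        time.sleep(random.uniform(low_s, high_s))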
def get_zapad24ru():
    ignored = get_ignored_films()
    ignored_cinemas = get_ignored_cinemas()

    source = ImportSources.objects.get(url='http://zapad24.ru/')
    sfilm_clean(source)

    cities_dict = get_source_data(source, 'city', 'dict')
    cinemas_dict = get_source_data(source, 'cinema', 'dict')
    schedules = get_source_data(source, 'schedule', 'list')

    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    today = datetime.datetime.now()
    next_month = datetime.date.today() + datetime.timedelta(days=40)

    data_nof_films = ''
    data_nof_cinema = ''
    data_nof_city = ''
    noffilms = []

    req = urllib.urlopen('%safisha/' % source.url)
    if req.getcode() == 200:
        data = BeautifulSoup(req.read())
        div = data.find('div', align="left")
        for ind, table in enumerate(div.findAll(
                'table', border="0", cellpadding="0", cellspacing="0", width="100%")):
            # header looks like: Кинотеатр "Название" (г. Город)
            cinema_tag = table.find('strong').string.encode('utf-8')
            cinema_name = re.findall(r'\".+\"', cinema_tag)[0].replace('"', '').strip()
            cinema_slug = low(del_separator(cinema_name))
            cinema_id = cinema_slug.decode('utf-8')

            city_name = re.findall(r'\(.+\)', cinema_tag)[0].replace(
                '(г. ', '').replace(')', '').strip()
            city_slug = low(del_separator(city_name))
            city_id = city_slug.decode('utf-8')

            city_obj = cities_dict.get(city_id)
            if not city_obj:
                city = City.objects.filter(
                    name__name=city_slug, name__status=2).distinct('pk')
                if city.count() == 1:
                    city_obj = SourceCities.objects.create(
                        source_id=city_id,
                        source_obj=source,
                        city=city[0],
                        name=city_name,
                    )
                    cities_dict[city_id] = city_obj
                else:
                    if 'slug="%s"' % city_slug not in data_nof_city:
                        data_nof_city += '<city name="%s" slug="%s"></city>' % (
                            city_name, city_slug)

            if city_obj:
                cinema_ig_id = u'%s__%s' % (
                    cinema_slug.decode('utf-8'), city_obj.city.kid)
                if cinema_ig_id not in ignored_cinemas:
                    cinema_obj = cinemas_dict.get(cinema_id)
                    if not cinema_obj:
                        filter1 = {
                            'name__name': cinema_slug,
                            'name__status': 2,
                            'city': city_obj.city,
                        }
                        cinema_kid = cinema_identification(cinema_slug, filter1)
                        if cinema_kid:
                            try:
                                cinema = Cinema.objects.get(code=cinema_kid)
                                cinema_obj = SourceCinemas.objects.create(
                                    source_id=cinema_id,
                                    source_obj=source,
                                    city=city_obj,
                                    cinema=cinema,
                                    name=cinema_name,
                                )
                                cinemas_dict[cinema_id] = cinema_obj
                            except Cinema.DoesNotExist:
                                pass
                        else:
                            if 'slug="%s"' % cinema_slug not in data_nof_cinema:
                                data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (
                                    cinema_name, cinema_slug, city_name, city_obj.city.kid)

                    if cinema_obj:
                        film_table = table.find('table')
                        date_from = None
                        date_to = None
                        for tr in film_table.findAll('tr'):
                            film_name, film_slug, film_id = (None, None, None)
                            if ind == 0:
                                # the first table lists today's films only
                                film_name = tr.find('b').string.encode('utf-8').strip()
                                film_slug = low(del_separator(film_name))
                                film_id = film_slug.decode('utf-8')
                            else:
                                # other tables carry a run of dates plus the title
                                showdate = ''
                                for f in tr.findAll('b'):
                                    if f.find('span'):
                                        showdate = f.find('span').string.encode('utf-8').strip()
                                    else:
                                        film_name = f.string.encode('utf-8').strip()
                                        film_name = re.findall(r'\«.+\»', film_name)[0]
                                        film_name = film_name.replace('«', '').replace('»', '').strip()
                                        film_slug = low(del_separator(film_name))
                                        film_id = film_slug.decode('utf-8')

                                if showdate and film_name:
                                    try:
                                        # numeric form: "12.04 - 25.04"
                                        date_from, date_to = showdate.split('-')
                                        date_from_day, date_from_month = date_from.strip().split('.')
                                        date_to_day, date_to_month = date_to.strip().split('.')
                                    except ValueError:
                                        # verbose form: "12 апреля – 25 апреля"
                                        date_from, date_to = showdate.split(' – ')
                                        date_from_day, date_from_month = date_from.strip().split()
                                        date_from_month = get_month(date_from_month)
                                        date_to_day, date_to_month = date_to.strip().split()
                                        date_to_month = get_month(date_to_month)
                                    date_from = datetime.date(
                                        today.year, int(date_from_month), int(date_from_day))
                                    date_to = datetime.date(
                                        today.year, int(date_to_month), int(date_to_day))

                            full_url = tr.find('a').get('href').encode('utf-8')

                            if film_id not in noffilms and film_id not in ignored:
                                obj = films.get(film_id)
                                next_step = checking_obj(obj)
                                if next_step:
                                    if obj:
                                        kid = obj.kid
                                    else:
                                        kid, info = film_identification(
                                            film_slug, None, {}, {}, source=source)

                                    objt = None
                                    if kid:
                                        create_new, objt = unique_func(fdict, kid, obj)
                                        if create_new:
                                            objt = create_sfilm(film_id, kid, source, film_name)
                                            films[film_id] = objt
                                            if not fdict.get(kid):
                                                fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                            fdict[kid]['script_rel'].append(objt)
                                    elif not obj:
                                        # accumulate into data_nof_films (the variable
                                        # written to the dump below)
                                        data_nof_films += xml_noffilm(
                                            film_name, film_slug, None, None,
                                            film_id.encode('utf-8'), info, full_url, source.id)
                                        noffilms.append(film_id)

                                    if objt:
                                        req_film = urllib.urlopen(full_url)
                                        if req_film.getcode() == 200:
                                            data_film = BeautifulSoup(req_film.read())
                                            td = data_film.find(
                                                'td', {'class': 'news'}).div.text.encode('utf-8')

                                            showtime = []
                                            if ind == 0:
                                                showtime = re.findall(r'\d+\:\d+\s\s?', td)
                                            else:
                                                if date_from and date_to:
                                                    if date_to < next_month:
                                                        showtimes = re.findall(
                                                            r'Начало сеансов:\s?[\d+\-\d+\,?\s?]+', td)
                                                        times = []
                                                        for t in showtimes:
                                                            t = t.replace('Начало сеансов:', '').split(',')
                                                            times = [i.strip() for i in t if i.strip()]
                                                        # expand the date range into one
                                                        # datetime per day per start time
                                                        delta = date_to - date_from
                                                        for day in range(delta.days + 1):
                                                            d = date_from + datetime.timedelta(days=day)
                                                            for t in times:
                                                                hours, minutes = t.split('-')
                                                                dtime = datetime.datetime(
                                                                    d.year, d.month, d.day,
                                                                    int(hours), int(minutes))
                                                                showtime.append(dtime)

                                            for t in showtime:
                                                if ind == 0:
                                                    hours, minutes = t.strip().split(':')
                                                    dtime = datetime.datetime(
                                                        today.year, today.month, today.day,
                                                        int(hours), int(minutes))
                                                else:
                                                    dtime = t

                                                sch_id = '%s%s%s%s' % (
                                                    dtime, cinema_slug, city_slug,
                                                    film_id.encode('utf-8'))
                                                sch_id = sch_id.replace(' ', '').decode('utf-8')
                                                if sch_id not in schedules:
                                                    SourceSchedules.objects.create(
                                                        source_id=sch_id,
                                                        source_obj=source,
                                                        film=objt,
                                                        cinema=cinema_obj,
                                                        dtime=dtime,
                                                    )
                                                    schedules.append(sch_id)

    create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_city)
    create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_cinema)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_films)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
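# zapad24.ru publishes a run like "12.04 - 25.04" plus a list of start times;
# get_zapad24ru() expands that into one datetime per day per time. The
# expansion, isolated (sample dates and times below are assumptions):
import datetime

def expand_showtimes(date_from, date_to, times):
    result = []
    delta = date_to - date_from
    for day in range(delta.days + 1):
        d = date_from + datetime.timedelta(days=day)
        for t in times:
            # start times on the site use '-' between hours and minutes
            hours, minutes = t.split('-')
            result.append(datetime.datetime(d.year, d.month, d.day,
                                            int(hours), int(minutes)))
    return result

# expand_showtimes(datetime.date(2013, 4, 12), datetime.date(2013, 4, 13),
#                  ['16-30', '18-45'])  # -> 4 datetimes, 2 per day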
def get_bigyalta_organizations():
    REG_ADDR = re.compile(r'Адрес\:\s?.+')
    REG_TEL = re.compile(r'Телефон\:\s?.+')
    REG_MAIL = re.compile(r'E-mail\:\s?.+')
    REG_SITE = re.compile(r'Сайт\:\s?.+')

    source = ImportSources.objects.get(url='http://www.bigyalta.info/')

    org_phones = OrganizationPhones.objects.filter(organization__source_obj=source)
    phones_objs = {}
    for i in org_phones:
        phones_objs[i.phone] = i

    org_tags = OrganizationTags.objects.filter(organization__source_obj=source)
    tags_objs = {}
    for i in org_tags:
        tags_objs[i.name] = i

    org_streets = Street.objects.all()
    org_streets_dict = {}
    for i in org_streets:
        org_streets_dict[i.slug.encode('utf-8')] = i

    source_ids = list(Organization.objects.filter(
        source_obj=source).values_list('source_id', flat=True))

    city_name = 'Ялта'
    city = City.objects.get(name__name=city_name, name__status=1)

    with open('%s/organizations_bigyalta.xml' % settings.API_DUMP_PATH, 'r') as f:
        data = BeautifulSoup(f.read(), from_encoding="utf-8")
        count = 0
        for i in data.findAll('url'):
            count += 1
            url = i['value'].encode('utf-8')
            title = i['name'].encode('utf-8')
            source_id = url.replace(
                'http://www.bigyalta.info/business/index.php?show=', '').decode('utf-8')
            if source_id not in source_ids:
                req = urllib.urlopen(url)
                if req.getcode() == 200:
                    # keep the name distinct from the outer `data` soup
                    page = BeautifulSoup(req.read(), from_encoding="utf-8")
                    div = page.find(
                        'div', style='float:left; width:670px; padding-right:10px;')
                    div = div.text.encode('utf-8')

                    # address -> street/building; defaults guard against
                    # an entry with no recognizable address
                    street_obj = None
                    house = None
                    addr = REG_ADDR.findall(div)
                    if addr:
                        addr = addr[0].replace('Адрес:', '').replace('Ялта,', '').strip()
                        street_name, street_type, house = get_org_street(addr.decode('utf-8'))
                        if street_type and street_name:
                            street_slug = low(del_separator(street_name))
                            street_obj = org_streets_dict.get(street_slug)
                            if not street_obj:
                                street_obj = Street.objects.create(
                                    name=street_name, slug=street_slug, type=street_type)
                                org_streets_dict[street_slug] = street_obj
                        elif not street_type:
                            house = None
                    building_obj = org_build_create(house, city, street_obj)

                    # phones, site, e-mail
                    phones_temp = REG_TEL.findall(div)

                    email = REG_MAIL.findall(div)
                    if email:
                        email = email[0].replace('E-mail:', '').strip()
                    if not email:
                        email = None

                    site = REG_SITE.findall(div)
                    if site:
                        site = site[0].replace('Сайт:', '').strip()
                    if not site:
                        site = None

                    phones = []
                    if phones_temp:
                        phones_temp = phones_temp[0].replace('Телефон:', '').replace(
                            ' ', '').replace('-', '').replace('–', '')
                        phone = REG_PHONE.findall(phones_temp)
                        if phone:
                            phone = phone[0].decode('utf-8')
                            phone_obj = phones_objs.get(phone)
                            if not phone_obj:
                                phone_obj = OrganizationPhones.objects.create(phone=phone)
                                phones_objs[phone] = phone_obj
                            phones.append(phone_obj)

                    # labels (categories, tags)
                    tags = []
                    for cat in i.findAll('tag'):
                        category_name = cat['value']
                        category_obj = tags_objs.get(category_name)
                        if not category_obj:
                            category_obj = OrganizationTags.objects.create(name=category_name)
                            tags_objs[category_name] = category_obj
                        tags.append(category_obj)

                    org_obj = Organization.objects.create(
                        name=title,
                        site=site,
                        email=email,
                        note=None,
                        source_obj=source,
                        source_id=source_id,
                    )
                    for j in phones:
                        org_obj.phones.add(j)
                    for j in tags:
                        org_obj.tags.add(j)
                    org_obj.buildings.add(building_obj)
                    source_ids.append(source_id)

            if count % 10 == 0:
                time.sleep(random.uniform(1.0, 3.0))

    return HttpResponse('finish')
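# get_bigyalta_organizations() pulls contact fields out of a plain-text block
# with anchored regexes and strips the field labels afterwards. A reduced
# version of the same idea (the sample text is invented):
import re

REG_MAIL_DEMO = re.compile(r'E-mail\:\s?.+')
REG_SITE_DEMO = re.compile(r'Сайт\:\s?.+')

text = 'Адрес: ул. Пушкинская, 1\nE-mail: info@example.com\nСайт: example.com'

email = REG_MAIL_DEMO.findall(text)
email = email[0].replace('E-mail:', '').strip() if email else None
# -> 'info@example.com'

site = REG_SITE_DEMO.findall(text)
site = site[0].replace('Сайт:', '').strip() if site else None
# -> 'example.com'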