Example #1
def save_sms_sources(request):
    '''
    Parser for the sms.txt file
    '''
    # get the Language object for Russian
    lang = Language.objects.get(pk=1)
    list_all = []
    sms_file = open(rel('sources/sms.txt'), 'r')
    # read the data from the file
    for line in sms_file.read().split('\n'):
        listt = []
        for i, l in enumerate(line.split('\t')):
            if i == 1: listt.append(capit(low(l)))
            elif i == 2: listt.append(l.split(' ')[0])
            elif i == 4: listt.append(l)
        # collect the row data into the overall list
        list_all.append(listt)
    sms_file.close()
    # get the source object for SMS
    source = ImportSources.objects.get(source='SMS')
    # walk the collected rows
    for l in list_all:
        try:
            if l[1] != 'ЗАКРЫТ':
                # strip special characters from the city name
                slug_city = low(del_separator(l[1]))
                # look up by the cleaned name
                try: city = City.objects.get(name__name=slug_city)
                except City.DoesNotExist:
                    # if not found, look up by the name as given by the source
                    try: city = City.objects.get(name__name=l[1])
                    except City.DoesNotExist:
                        # if still not found, look up by the lower-cased, capitalized source name
                        try: city = City.objects.get(name__name=capit(low(l[1])))
                        except City.DoesNotExist: city = None
                if city:
                    # strip special characters from the cinema name
                    slug_cinema = low(del_separator(l[0]))
                    # look up by the cleaned name
                    try: cinema = Cinema.objects.get(name__name=slug_cinema, city=city.id)
                    except Cinema.DoesNotExist:
                        # if not found, look up by the name as given by the source
                        try: cinema = Cinema.objects.get(name__name=l[0], city=city.id)
                        except Cinema.DoesNotExist: cinema = None
                    if cinema:
                        # get/create the halls for this cinema in this city
                        name1 = create_hallname(1, lang, 'без указания зала')
                        name2 = create_hallname(2, lang, 'безуказаниязала')
                        hall = create_hall((name1, name2), 0, 0, cinema)
                        # store the source url in the DB for later retrieval of showtime data
                        try: HallsSources.objects.get(id_hall=hall, source=source, url_hall_sources=l[2])
                        except HallsSources.DoesNotExist: HallsSources(id_hall=hall, source=source, url_hall_sources=l[2]).save()
                    else:
                        # cinema not found: log it
                        logger(**{'event': 2, 'code': 2, 'bad_obj': l[0], 'obj1': l[1], 'obj2': l[2], 'extra': city.id})
                else:
                    # city not found: log it
                    logger(**{'event': 2, 'code': 1, 'bad_obj': capit(low(l[1])), 'obj2': l[2]})
        except IndexError: pass
    return HttpResponseRedirect(reverse("main_kai"))
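The helpers low, capit, del_separator and del_screen_type recur in every example here but are not part of the listing. A minimal sketch of plausible Python 2 implementations, for reading the examples only; the project's real versions may differ in what they treat as a separator:

# -*- coding: utf-8 -*-
import re

def low(s):
    # lower-case a utf-8 byte string
    return s.decode('utf-8').lower().encode('utf-8')

def capit(s):
    # capitalize the first letter of a utf-8 byte string
    u = s.decode('utf-8')
    return (u[:1].upper() + u[1:]).encode('utf-8')

def del_separator(s):
    # drop whitespace and punctuation so slugs like 'кино-театр' and 'кинотеатр' compare equal
    return re.sub(ur'[\W_]+', '', s.decode('utf-8'), flags=re.UNICODE).encode('utf-8')

def del_screen_type(s):
    # strip screen-format markers (3D, 2D, IMAX) from a film title
    return re.sub(r'\b[23]D\b|\bIMAX\b', '', s, flags=re.IGNORECASE).strip()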
Example #2
def person_create_func(name_ru, parental, name_en):
    person_obj = AfishaPersons.objects.using('afisha').create(
        birth_year = 0,
        birth_mounth = 0,
        birth_day = 0,
        male = 0,
        national = 0,
        country_id = 0,
        imdb = 0
    )

    person = Person.objects.create(kid = person_obj.id)

    names_list = [
        {'name': name_ru.strip(), 'status': 1, 'lang': 1}, 
        {'name': low(del_separator(name_ru.strip().encode('utf-8'))), 'status': 2, 'lang': 1}, 
        {'name': name_en.strip(), 'status': 1, 'lang': 2}, 
        {'name': low(del_separator(name_en.strip().encode('utf-8'))), 'status': 2, 'lang': 2}, 
        {'name': parental.strip(), 'status': 3, 'lang': 1},
    ]
    for i in names_list:
        if i['name']:
            if i['status'] == 1:
                try:
                    afisha_person_name_create(person_obj, i['name'], i['lang'])
                except db.backend.Database._mysql.OperationalError:
                    i['name'] = i['name'].encode('ascii', 'xmlcharrefreplace')
                    afisha_person_name_create(person_obj, i['name'], i['lang'])

            name, created = person_name_create(i['name'], i['lang'], i['status'])
            person.name.add(name)

    return person_obj
Example #3
def nowru_ident():
    source = ImportSources.objects.get(url='http://www.now.ru/')
    ignored = get_ignored_films()

    data_nof_film = ''
    nowru_data = Nowru.objects.filter(kid=None)

    for i in nowru_data:
        name_ru_slug = low(del_separator(i.name_ru.encode('utf-8')))
        if name_ru_slug.decode('utf-8') not in ignored:
            name_en_slug = low(del_separator(i.name_en.encode('utf-8')))
            kid, info = film_identification(name_ru_slug,
                                            name_en_slug, {}, {},
                                            year=i.year,
                                            source=source)
            if kid:
                i.kid = kid
                i.save()
            else:
                if 'slug="%s"' % name_ru_slug not in data_nof_film:
                    name_ru = i.name_ru.encode('utf-8')
                    name_en = i.name_en.encode('utf-8')
                    data_nof_film += xml_noffilm(name_ru, name_ru_slug,
                                                 name_en, name_en_slug,
                                                 i.nowru_id, info, None,
                                                 source.id)

    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('xml', source.dump, 'players', 'Онлайн плееры')
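create_dump_file and cron_success close out nearly every job in this listing but are not shown. cron_success is presumably a cron-log marker; create_dump_file presumably just writes the payload to a named file. A sketch under that assumption (the file name and extension are guesses):

import os

def create_dump_file(name, path, data):
    # write the xml payload to '<path>/<name>.xml'; the real helper may also
    # handle encodings or atomic replacement
    with open(os.path.join(path, '%s.xml' % name), 'w') as f:
        f.write(data)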
Example #4
def raspishi_relations():
    source = ImportSources.objects.get(url='http://распиши.рф/')

    ignored = get_ignored_films()
    data_nof_film = ''

    domain = u'распиши.рф'
    url = 'http://%s/getfilmxml.php' % domain.encode('idna')

    req = urllib.urlopen(url)
    if req.getcode() == 200:
        films_rid = list(
            RaspishiRelations.objects.exclude(kid=0).values_list('rid',
                                                                 flat=True))

        xml_data = BeautifulSoup(req.read(), from_encoding="utf-8")
        for i in xml_data.findAll('movie'):
            id = int(i['id'])
            if id not in films_rid:
                name_ru = i.find('name').text.encode('utf-8')
                name_en = i.find('nameeng').text.encode('utf-8')

                name_ru = re.sub(r'\(.*?\)', '', name_ru).strip()
                name_en = re.sub(r'\(.*?\)', '', name_en).strip()

                name_slug = low(del_separator(del_screen_type(name_ru)))
                name_en_slug = low(del_separator(del_screen_type(name_en)))

                if name_slug.decode('utf-8') not in ignored:
                    try:
                        kid, info = film_identification(name_slug,
                                                        None, {}, {},
                                                        source=source)

                        if kid:
                            created = RaspishiRelations.objects.create(
                                rid=id,
                                kid=kid,
                                name_ru=name_ru,
                                name_en=name_en,
                            )
                        else:
                            data_nof_film += xml_noffilm(
                                name_ru, name_slug, name_en, name_en_slug, id,
                                info, None, source.id)
                    except db.backend.Database._mysql.OperationalError:
                        pass
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('xml', source.dump, 'films', 'Укр. сеансы')
Example #5
def get_cinemate_cc_film(data, source, ignored, noffilms):
    flist = []
    for div in data.findAll('div', {'class': "movie-brief"}):
        h3 = div.find('h3')
        a = h3.find('a')
        film_url = a.get('href')
        film_id = int(film_url.replace('/movie/', '').replace('/', ''))
        film_name = a.text.encode('utf-8')
        film_slug = low(del_separator(film_name))

        if film_slug.decode(
                'utf-8') not in ignored and film_id not in noffilms:
            full_url = '%s%s' % (source.url, film_url.lstrip('/'))
            film_year = int(
                h3.find('small').text.encode('utf-8').replace('(', '').replace(
                    ')', ''))

            download_available = False

            ul = div.find('ul')
            for link in ul.findAll('a'):
                a_txt = link.text.encode('utf-8').strip()
                if a_txt == 'Скачать':
                    download_available = True

            if download_available:
                flist.append({
                    'id': film_id,
                    'name': film_name,
                    'slug': film_slug,
                    'year': film_year,
                    'url': full_url,
                })
    return flist
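A quick way to exercise get_cinemate_cc_film is to feed it the markup shape it expects, with a stub source object and empty ignore sets. Python 2, a utf-8 source file, and the helper sketches from above are assumed:

from bs4 import BeautifulSoup

SAMPLE_HTML = '''
<div class="movie-brief">
  <h3><a href="/movie/12345/">Example Film</a> <small>(2013)</small></h3>
  <ul><li><a href="#">Скачать</a></li></ul>
</div>
'''

class StubSource(object):
    url = 'http://cinemate.cc/'  # assumed base url for this source

films = get_cinemate_cc_film(BeautifulSoup(SAMPLE_HTML), StubSource(), set(), set())
# expect one dict: id=12345, year=2013, url='http://cinemate.cc/movie/12345/'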
Example #6
def imdb_search2(imdb_id, name, year, kid):
    film_name = name
    slug = low(del_separator(film_name.encode('utf-8')))
    film_name = film_name.encode('ascii', 'xmlcharrefreplace')
    xml = '<film n="%s" s="%s" y="%s" id="%s" d="" r=""></film>' % (film_name, slug, str(year).encode('utf-8'), str(imdb_id).encode('utf-8'))
    data = exp_film_data(imdb_id)

    if data:
        if data.get('double'):
            return simplejson.dumps(data)
        else:
            if not data['kid']:
                pass
            elif int(data['kid']) != int(kid):
                return simplejson.dumps({'status': True, 'redirect': True, 'kid': data['kid']})

    data_nof_persons, distr_nof_data, dump, good = get_imdb_data(xml, False, 1, [int(imdb_id),], True, kid)

    if good:
        data = exp_film_data(imdb_id)
        if not data:
            data = {'status': False}
    else:
        data = {'status': False}
    if kid:
        cache.delete_many(['get_film__%s' % kid, 'film__%s__fdata' % kid])
    return simplejson.dumps(data)
Example #7
def imdb_film_ident():
    source = ImportSources.objects.get(url='http://www.imdb.com/')
    
    films = Films.objects.filter(kid=None)
    films_ids = [i.imdb_id for i in films]

    exist_films = Film.objects.using('afisha').filter(idalldvd__in=films_ids)
    exist_ids = {}
    for i in exist_films:
        exist_ids[i.idalldvd] = i.id

    data_nof_film = ''
    
    for i in films:
        name = None
        for j in i.name.filter(status=1, language__id=2):
            name = j.name.encode('utf-8')
            
        slug = low(del_separator(name))
        
        kid = exist_ids.get(long(i.imdb_id))
        
        if kid:
            i.kid = kid
            i.save()
        else:
            full_url = '%stitle/tt%s/' % (source.url, i.imdb_id)
            data_nof_film += xml_noffilm(name, slug, None, None, i.imdb_id, 'Фильм не найден', full_url.encode('utf-8'), source.id)
            
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'films_ident', 'Идентификация')
Example #8
def person_name_detect(request, ru, en):
    try:
        film_editor = is_film_editor(request)
        
        if film_editor:
            name = escape(strip_tags(ru)).encode('utf-8').strip()
            en = escape(strip_tags(en)).encode('utf-8').strip()

            slug_ru = low(del_separator(name))
            slug_en = low(del_separator(en))
            

            queries = []
            if name:
                queries.append(Q(name__icontains=slug_ru, status=1))
            if en:
                queries.append(Q(name__icontains=en, status=1))

            query = queries.pop()
            for item in queries:
                query |= item

            data = list(NamePerson.objects.filter(query, language__id__in=(1,2), person__kid__gt=0).values('language', 'person__kid', 'name'))

            names = {}
            for i in data:
                if not names.get(i['person__kid']):
                    names[i['person__kid']] = {'ru': '', 'en': '', 'id': i['person__kid']}
                if i['language'] == 1:
                    names[i['person__kid']]['ru'] = i['name']
                elif i['language'] == 2:
                    names[i['person__kid']]['en'] = i['name']
            
            names = sorted(names.values(), key=operator.itemgetter('ru'))

            txt = ''
            for i in names:
                txt += u'<div style="border-bottom:1px solid #CCC; padding:5px; background:#EBEBEB; min-width: 300px;"><a href="http://kinoinfo.ru/person/%s/" target="_blank">%s / %s</a></div>' % (i['id'], i['ru'], i['en'])
            if txt:
                txt = u'В базе есть похожие персоны:<br />%s' % txt

            return simplejson.dumps({
                'status': True,
                'content': txt,
            })
    except Exception as e:
        open('errors.txt','a').write('%s * (%s)' % (dir(e), e.args))
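The pop-and-loop OR-combination of Q objects above is a common Django idiom; it can be written in one line with reduce, assuming (as the original already does) that queries is non-empty:

import operator

query = reduce(operator.or_, queries)  # reduce is a builtin in Python 2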
Example #9
def get_imdb_film_list():

    source = ImportSources.objects.get(url='http://www.imdb.com/')

    url = '%scalendar/?region=us' % source.url
    
    opener = give_me_cookie()
    req = opener.open(urllib2.Request(url))
    
    xml = ''
    ids = []
    if req.getcode() == 200:
        data = BeautifulSoup(req.read(), from_encoding="utf-8")
        div = data.find('div', id="main")
        old_date = ''
        for h4 in div.findAll('h4'):
            release = h4.string.encode('utf-8')
            day, month, year = release.split()
            
            month = get_month_en(low(month))
            
            rel_date = '%s-%s-%s' % (year, month, day)

            xml += '<date v="%s">' % rel_date
                
            ul = h4.find_next('ul')
            
            for li in ul.findAll('li'):
                year = li.find('span', {'class': "year_type"}).string.encode('utf-8')
                if 'documentary' not in low(year):
                    year = re.findall(r'\d+', year)
                    if year:
                        details = li.find('i')
                        if details:
                            details = str(details).encode('utf-8').replace('<i>','').replace('</i>','')
                            details = details.replace('(','').replace(')','')
                        else:
                            details = ''
                            
                        if 'limited' not in low(details) and 'fest' not in low(details) and 'tv premiere' not in low(details):
                            film_name = li.a.string.encode('utf-8').replace('"', '&quot;').replace('&','&amp;')
                            film_slug = low(del_separator(film_name))
                            full_url = li.a.get('href').encode('utf-8')
                            imdb_id = full_url.replace('/title/tt', '').replace('/', '')
                        
                            xml += '<film n="%s" s="%s" y="%s" id="%s" d="%s" r="%s"></film>' % (film_name, film_slug, year[0], imdb_id, details, rel_date)
                            ids.append(imdb_id)
                    
            xml += '</date>'
    ids = ';'.join(set(ids))
    xml = '<data><ids value="%s">%s</ids></data>' % (ids, xml)

    create_dump_file('%s_film_list' % source.dump, settings.API_DUMP_PATH, xml)
    cron_success('html', source.dump, 'films_list', 'Список релизов')
Example #10
def imdb_person_search(request, pid, name, exist):
    try:
        from person.views import person_name_create
        from release_parser.imdb import imdb_person_searching

        if request.user.is_superuser:
            name = escape(strip_tags(name)).encode('utf-8').strip()
            slug = low(del_separator(name))

            person = Person.objects.get(pk=pid)

            # if there was no (en) name yet, create it
            if not exist:
                if name:
                    exist = True

                    person_names = person.name.all()

                    names = [
                        {'name': name, 'status': 1},
                        {'name': slug, 'status': 2},
                    ]

                    for i in names:
                        name_obj, created = person_name_create(i['name'], i['status'], 2)
                        if name_obj not in person_names:
                            person.name.add(name_obj)

            if exist:
                result = imdb_person_searching(name)
                txt = ''

                for i in result:

                    txt += '<div style="border-bottom:1px solid #CCC; padding:5px; background:#EBEBEB; min-width: 300px;"><a href="http://www.imdb.com%s" target="_blank">%s</a> <i>%s</i><br /> <input type="button" value="Выбрать" id="%s" class="imdb_person_list_select" /></div>' % (i['link'].encode('utf-8'), i['title'], i['details'], i['id'])

                txt += '<br /><div>Или укажите ссылку на страницу персоны IMDb:<br /><input type="text" value="" size="40" class="imdb_person_url" /> <input type="button" value="Искать" class="imdb_person_list_select" /><input type="hidden" value="%s" id="pid" /></div>' % person.id

                return simplejson.dumps({
                    'status': True,
                    'content': txt,
                    'query': name,
                })

        return simplejson.dumps({})
    except Exception as e:
        open('errors.txt','a').write('%s * (%s)' % (dir(e), e.args))
Example #11
def get_okinoua_cities():
    """
    Парсинг городов Украины
    """
    source = ImportSources.objects.get(url='http://www.okino.ua/')

    # Получаем список городов с таблицы SourceCities в виде списка
    cities_ids = get_source_data(source, 'city', 'list')
    data_nof_city = ''

    # Открываем страницу с городами
    url = '%skinoafisha-kiev/' % source.url
    req = urllib.urlopen(url)
    if req.getcode() == 200:
        page = BeautifulSoup(req.read(), from_encoding="utf-8")
        # Find every city tag and read the city id and name from it
        for ul in page.findAll('ul', {'class': 'blist'}):
            for li in ul.findAll('li'):
                id = li.a.get('href').replace('/', '')
                name = li.a.string.encode('utf-8').strip()
                name_slug = low(del_separator(name))
                # Compare the scraped city against our DB and, if it is NOT known yet,
                if id not in cities_ids:
                    # identify the new city
                    city = City.objects.filter(name__name=name_slug,
                                               name__status=2).distinct('pk')
                    # if identified, write it to the SourceCities table
                    if city.count() == 1:
                        SourceCities.objects.create(
                            source_id=id,
                            source_obj=source,
                            city=city[0],
                            name=name,
                        )
                    # otherwise record the unmatched city in xml for later processing
                    else:
                        if 'slug="%s"' % name_slug not in data_nof_city:
                            data_nof_city += '<city name="%s" slug="%s"></city>' % (
                                name, name_slug)

    create_dump_file('okinoua_nof_city', settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_city)
    cron_success('html', 'okinoua', 'cities', 'Укр. города')
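get_source_data appears in most of the import jobs but is not shown. Judging by its call sites it returns the already-imported source ids, either as a list or as a dict keyed by source_id; a sketch under that assumption, with the model choice guessed from the 'city'/'cinema' argument:

def get_source_data(source, entity, as_type):
    # assumed helper: collect previously imported ids for this source
    model = SourceCities if entity == 'city' else SourceCinemas
    qs = model.objects.filter(source_obj=source)
    if as_type == 'list':
        return [i.source_id for i in qs]
    return dict((i.source_id, i) for i in qs)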
Example #12
def get_rambler_cities():
    source = ImportSources.objects.get(url='http://www.rambler.ru/')

    cities_ids = get_source_data(source, 'city', 'list')
    data_nof_city = ''
    '''
    # LOCALHOST
    f = open('%s/dump_rambler_city.xml' % settings.API_DUMP_PATH, 'r')
    xml = BeautifulSoup(f.read(), from_encoding="utf-8")
    f.close()
    if xml: # --- end localhost
    '''

    # SERVER
    url = 'http://api.kassa.rambler.ru/v2/%s/xml/cities/' % RAMBLER_API_KEY  # dump_rambler_city.xml
    req = urllib.urlopen(url)
    if req.getcode() == 200:
        xml = BeautifulSoup(req.read(),
                            from_encoding="utf-8")  # --- end server

        for i in xml.findAll('city'):
            id = i.cityid.string
            name = i.find('name').string.encode('utf-8')
            name_slug = low(del_separator(name))
            if id not in cities_ids:
                city = City.objects.filter(name__name=name_slug,
                                           name__status=2).distinct('pk')
                if city.count() == 1:
                    SourceCities.objects.create(
                        source_id=id,
                        source_obj=source,
                        city=city[0],
                        name=name,
                    )
                else:
                    if 'slug="%s"' % name_slug not in data_nof_city:
                        data_nof_city += '<city name="%s" slug="%s"></city>' % (
                            name, name_slug)

    create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_city)
    cron_success('xml', source.dump, 'cities', 'Города')
Example #13
def get_name_film_obj(film):
    '''
    Get the film-name object
    '''
    # strip the screen format (3D, 2D, ...) from the name
    f = del_screen_type(film)
    # strip special characters and lower-case the name
    f = low(del_separator(f))
    # look up by the cleaned name
    try: name = NameProduct.objects.filter(name=f)[0]
    except IndexError:
        # if not found, look up by the name as given by the source
        try: name = NameProduct.objects.filter(name=film)[0]
        except IndexError:
            # if still not found, look up by the lower-cased source name
            try: name = NameProduct.objects.filter(name=low(film))[0]
            except IndexError:
                # finally, look up by the capitalized source name
                try: name = NameProduct.objects.filter(name=capit(film))[0]
                except IndexError: name = None
    return name
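The same four-step fallback reads more directly as a loop over candidate names. Purely illustrative, using the model and helpers exactly as above:

def get_name_film_obj_alt(film):
    # try each candidate spelling in turn; first hit wins
    cleaned = low(del_separator(del_screen_type(film)))
    for candidate in (cleaned, film, low(film), capit(film)):
        try:
            return NameProduct.objects.filter(name=candidate)[0]
        except IndexError:
            continue
    return None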
Example #14
def get_premierzal_cities():
    source = ImportSources.objects.get(url='http://www.premierzal.ru/')

    cities = get_source_data(source, 'city', 'list')

    data_nof_city = ''

    req = urllib.urlopen(source.url)
    if req.getcode() == 200:
        data = BeautifulSoup(req.read())

        block = data.find('div', {'class': 'drop'})

        for i in block.findAll('a'):
            city_name = i.text.encode('utf-8').strip()
            city_id = low(del_separator(city_name))

            if city_id.decode('utf-8') not in cities:

                city = City.objects.filter(name__name=city_id,
                                           name__status=2).distinct('pk')

                if city.count() == 1:
                    SourceCities.objects.create(
                        source_id=city_id,
                        source_obj=source,
                        city=city[0],
                        name=city_name,
                    )
                else:
                    data_nof_city += '<city name="%s" slug="%s"></city>' % (
                        city_name, city_id)

                cities.append(city_id.decode('utf-8'))

    create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_city)
    cron_success('html', source.dump, 'cities', 'Города')
Example #15
def get_kinohod_cinemas():
    #    print "BEGIN get_kinohod_cinemas()"
    t1 = time.time()
    start_time = datetime.datetime.now().strftime('%H:%M:%S')

    cron_data_new = ''
    cron_data_nof = ''
    cron_count = 0

    main_url = 'http://www.kinohod.ru/api/rest/partner/v1/cinemas?apikey=%s' % SERVER_API_KEY

    source = ImportSources.objects.get(url='http://kinohod.ru/')
    kinohod_cinemas = get_source_data(source, 'cinema', 'list')
    kinohod_cities_dict = get_source_data(source, 'city', 'dict')

    cinemass = Cinema.objects.all()
    cinemass_dict = {}
    for i in cinemass:
        cinemass_dict[i.code] = i

    count = 0
    data_nof_cinema = ''
    for cid, kinohod_city in kinohod_cities_dict.iteritems():
        try:
            url = '%s&city=%s' % (main_url, cid)
            req = urllib.urlopen(url)
            if req.getcode() == 200:
                json_data = req.read()
                data = json.loads(json_data)
                for i in data:
                    cron_count += 1
                    id = str(i['id']).decode('utf-8')
                    if id not in kinohod_cinemas:
                        name = i['title']
                        name_slug = del_screen_type(name.encode('utf-8'))
                        name_slug = low(del_separator(name_slug))
                        short_name = i['shortTitle']
                        short_name_slug = del_screen_type(
                            short_name.encode('utf-8'))
                        short_name_slug = low(del_separator(short_name_slug))

                        filter1 = {
                            'name__name': name_slug,
                            'name__status': 2,
                            'city__id': kinohod_city.city_id
                        }
                        filter2 = {
                            'name__name': short_name_slug,
                            'name__status': 2,
                            'city__id': kinohod_city.city_id
                        }
                        cinema_kid = cinema_identification(
                            short_name_slug, filter1, filter2)
                        cin_obj = cinemass_dict.get(cinema_kid)
                        if cin_obj:
                            SourceCinemas.objects.create(
                                source_id=id,
                                source_obj=source,
                                city=kinohod_city,
                                cinema=cin_obj,
                                name=name,
                                name_alter=short_name,
                            )
                            cron_data_new += '%s<br />' % short_name.encode(
                                'utf-8')
                        else:
                            count += 1
                            name_city = kinohod_city.name
                            data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (
                                short_name.encode('utf-8'), short_name_slug,
                                name_city.encode('utf-8'),
                                kinohod_city.city.kid)
                            cron_data_nof += '%s<br />' % short_name.encode(
                                'utf-8')
                        kinohod_cinemas.append(id)
        except IOError:
            open('%s/ddd.txt' % settings.API_DUMP_PATH,
                 'a').write(str(url) + '\n')
    data_nof_cinema += '<sum>%s</sum>' % count
    create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_cinema)

    # cron log
    end_time = datetime.datetime.now().strftime('%H:%M:%S')
    cron_data = '%s | %s - %s %s\n' % (datetime.datetime.now().date(),
                                       start_time, end_time,
                                       'Импорт кинотеатров киноход')
    cron_data += '<br /><b>Обработано</b>: %s' % cron_count
    cron_data += '<br /><b>Новые</b>: <br />%s' % cron_data_new
    cron_data += '<br /><b>Ненайденные</b>: <br />%s' % cron_data_nof
    for i in range(50):
        cron_data += '- '
    process_time = time.time() - t1
    cron_data = '<br />* %s сек.<br />%s' % (process_time, cron_data)
    open('%s/cron_log_kinohod_cinemas.txt' % settings.CRON_LOG_PATH,
         'a').write(cron_data)
    cron_success('json', source.dump, 'cinemas', 'Кинотеатры')
Example #16
def orsk_streets_fix(request):

    source = ImportSources.objects.get(url='http://www.orgpage.ru/')

    orgs = list(
        Organization.objects.filter(source_obj=source).values_list('id',
                                                                   flat=True))

    builds = Building.objects.select_related('street').filter(
        organization__id__in=orgs)

    for ind, i in enumerate(builds):
        if ind not in (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                       16, 17, 18, 19):
            st_type = i.street.name
            st = st_type

            street_name = st_type
            street_type = '1'

            #return HttpResponse(str(st_type.encode('utf-8')))

            if u'Набережная' in st_type or u'набережная' in st_type:
                street_name = st.encode('utf-8').replace(
                    'Набережная', '').replace('набережная', '').strip()
                street_type = '4'
            elif u'шоссе' in st_type or u'ш.' in st_type:
                street_name = st.encode('utf-8').replace('шоссе', '').replace(
                    'ш.', '').strip()
                street_type = '5'
            elif u'пл.' in st_type or u'площадь' in st_type or u'Площадь' in st_type or u'плщ.' in st_type:
                street_name = st.encode('utf-8').replace(
                    'площадь',
                    '').replace('пл.', '').replace('плщ.',
                                                   '').replace('Площадь',
                                                               '').strip()
                street_type = '3'
            elif u'проезд' in st_type or u'Проезд' in st_type:
                street_name = st.encode('utf-8').replace('проезд', '').replace(
                    'Проезд', '').strip()
                street_type = '7'
            elif u'Парк' in st_type or u'парк' in st_type:
                street_name = st.encode('utf-8').replace('парк', '').replace(
                    'Парк', '').strip()
                street_type = '10'
            elif u'пер.' in st_type or u'переулок' in st_type or u'Пер.' in st_type:
                street_name = st.encode('utf-8').replace('пер.', '').replace(
                    'Пер.', '').replace('переулок', '').strip()
                street_type = '2'
            elif u'пр-кт' in st_type or u'проспект' in st_type or u'Пр.' in st_type or u'просп.' in st_type:
                street_name = st.encode('utf-8').replace('пр-кт', '').replace(
                    'проспект', '').replace('Пр.', '').replace('просп.',
                                                               '').strip()
                street_type = '6'
            elif u'ул.' in st_type or u'улица' in st_type or u'Ул.' in st_type:
                street_name = st.encode('utf-8').replace('ул.', '').replace(
                    'Ул.', '').replace('улица', '').strip()
                street_type = '1'

            street_name = street_name.replace('ул.',
                                              '').replace('улица', '').replace(
                                                  'Ул.', '').strip()

            name_slug = low(del_separator(street_name))

            i.street.name = street_name
            i.street.slug = name_slug
            i.street.type = street_type
            i.street.save()

    return HttpResponse(str())
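The branch chain in orsk_streets_fix can also be table-driven, which makes the marker/type-code pairs easier to audit and extend. A sketch that reuses the same type codes but works on unicode strings rather than encoded bytes:

STREET_TYPES = (
    ('4', (u'Набережная', u'набережная')),
    ('5', (u'шоссе', u'ш.')),
    ('3', (u'пл.', u'площадь', u'Площадь', u'плщ.')),
    ('7', (u'проезд', u'Проезд')),
    ('10', (u'Парк', u'парк')),
    ('2', (u'пер.', u'переулок', u'Пер.')),
    ('6', (u'пр-кт', u'проспект', u'Пр.', u'просп.')),
    ('1', (u'ул.', u'улица', u'Ул.')),
)

def detect_street_type(name):
    # return (bare street name, type code); '1' (plain street) is the default
    for type_code, markers in STREET_TYPES:
        if any(m in name for m in markers):
            for m in markers:
                name = name.replace(m, '')
            return name.strip(), type_code
    return name.strip(), '1'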
Example #17
def get_orsk_organizations():

    source = ImportSources.objects.get(url='http://www.orgpage.ru/')

    org_phones = OrganizationPhones.objects.filter(
        organization__source_obj=source)
    phones_objs = {}
    for i in org_phones:
        phones_objs[i.phone] = i

    org_tags = OrganizationTags.objects.all()
    tags_objs = {}
    for i in org_tags:
        tags_objs[i.name] = i

    org_streets = Street.objects.all()
    org_streets_dict = {}
    for i in org_streets:
        org_streets_dict[i.slug.encode('utf-8')] = i

    source_ids = list(
        Organization.objects.filter(source_obj=source).values_list('source_id',
                                                                   flat=True))

    city_name = 'Орск'

    city = City.objects.get(name__name=city_name, name__status=1)
    '''
    # 1
    urls = [
        'http://www.orgpage.ru/орск_и_городской_округ_орск/администрация,_органы_исполнительной_власти/',
        'http://www.orgpage.ru/орск_и_городской_округ_орск/бизнес/',
        'http://www.orgpage.ru/орск_и_городской_округ_орск/досуг/',
        'http://www.orgpage.ru/орск_и_городской_округ_орск/жкх_и_благоустройство/',
        'http://www.orgpage.ru/орск_и_городской_округ_орск/культура/',
        'http://www.orgpage.ru/орск_и_городской_округ_орск/магазины/',
        'http://www.orgpage.ru/орск_и_городской_округ_орск/медицина/',
        'http://www.orgpage.ru/орск_и_городской_округ_орск/наука/',
        'http://www.orgpage.ru/орск_и_городской_округ_орск/образование/',
        'http://www.orgpage.ru/орск_и_городской_округ_орск/общественные_организации/',
        'http://www.orgpage.ru/орск_и_городской_округ_орск/посольства/',
        'http://www.orgpage.ru/орск_и_городской_округ_орск/промышленность/',
        'http://www.orgpage.ru/орск_и_городской_округ_орск/ремонт/',
        'http://www.orgpage.ru/орск_и_городской_округ_орск/организации_социального_комплекса/',
        'http://www.orgpage.ru/орск_и_городской_округ_орск/строительство/',
        'http://www.orgpage.ru/орск_и_городской_округ_орск/оптовая_торговля,_поставка_оборудования/',
        'http://www.orgpage.ru/орск_и_городской_округ_орск/транспорт/',
        'http://www.orgpage.ru/орск_и_городской_округ_орск/туризм_и_отдых/',
        'http://www.orgpage.ru/орск_и_городской_округ_орск/управление_и_контроль/',
        'http://www.orgpage.ru/орск_и_городской_округ_орск/услуги/',
    ]
    
    xml = ''
    
    count = 0
    for url in urls:
        req = urllib.urlopen(url)
        count += 1
        if req.getcode() == 200:
            data = BeautifulSoup(req.read(), from_encoding="utf-8")
            try:
                div = data.find('div', {'class': 'r_alphabet'})
                for link in div.findAll('li'):
                    name = link.a.text.encode('utf-8')
                    link = link.a.get('href').encode('utf-8')
                    xml += '<url link="%s" name="%s"></url>' % (link, name)
            except AttributeError: pass
            
        if count % 5 == 0:
            time.sleep(random.uniform(1.0, 3.0))
            
    open('organizations_orsk.xml', 'w').write(str('<data>%s</data>' % xml))
    '''
    '''
    # 2
    with open('%s/organizations_orsk.xml' % settings.API_DUMP_PATH, 'r') as f:
        data = BeautifulSoup(f.read(), from_encoding="utf-8")
    
    xml = ''
    count = 0
    for i in data.findAll('url'):
        url = '%s?order=date&onpage=100' % i['link'].encode('utf-8')
        tag = i['name'].encode('utf-8')
        
        req = urllib.urlopen(url)
        count += 1
        if req.getcode() == 200:
            data = BeautifulSoup(req.read(), from_encoding="utf-8")
            for a in data.findAll('a', {'class': 'name'}):
                name = a.text.encode('utf-8').replace('"', "'")
                link = a.get('href').encode('utf-8')
                
                xml += '<url link="%s" name="%s" tag="%s"></url>' % (link, name, tag)

        if count % 10 == 0:
            time.sleep(random.uniform(1.0, 3.0))
            
    open('organizations_orsk2.xml', 'w').write(str('<data>%s</data>' % xml))
    '''
    '''
    # 3
    with open('%s/organizations_orsk2.xml' % settings.API_DUMP_PATH, 'r') as f:
        data = BeautifulSoup(f.read(), from_encoding="utf-8")
    
    orgs = {}
    
    for i in data.findAll('url'):
        url = i['link'].encode('utf-8')
        name = i['name'].encode('utf-8')
        tag = i['tag'].encode('utf-8')
    
        if orgs.get(url):
            orgs[url]['tag'].append(tag)
        else:
            orgs[url] = {'tag': [tag], 'name': name, 'url': url}
    
    xml = ''
    
    for i in orgs.values():
        xml += '<url name="%s" url="%s">' % (i['name'], i['url'])
        for t in i['tag']:
            xml += '<tag name="%s"></tag>' % t
        xml += '</url>'

    open('organizations_orsk3.xml', 'w').write(str('<data>%s</data>' % xml))
    '''

    with open('%s/organizations_orsk3.xml' % settings.API_DUMP_PATH, 'r') as f:
        data = BeautifulSoup(f.read(), from_encoding="utf-8")

    count = 0
    for i in data.findAll('url'):
        count += 1

        url = i['url'].encode('utf-8')
        title = i['name'].encode('utf-8')

        uni = unidecode(i['name'])
        uni = re.findall(ur'[a-z0-9]+', low(uni))
        uni = '-'.join(uni) if uni else ''

        source_id = url.replace(
            'http://www.orgpage.ru/орск_и_городской_округ_орск/',
            '').decode('utf-8')

        if source_id not in source_ids:
            req = urllib.urlopen(url)

            if req.getcode() == 200:
                data = BeautifulSoup(req.read(), from_encoding="utf-8")
                address = data.find('span', itemprop='streetAddress')
                if address:
                    address = address.text.encode('utf-8')
                    street_name, street_type, house = get_org_street(
                        address.decode('utf-8'))
                    if street_type:
                        if street_name:
                            street_slug = low(del_separator(street_name))
                            street_obj = org_streets_dict.get(street_slug)
                            if not street_obj:
                                street_obj = Street.objects.create(
                                    name=street_name,
                                    slug=street_slug,
                                    type=street_type)
                                org_streets_dict[street_slug] = street_obj
                        else:
                            street_obj = None
                            house = None

                        building_obj = org_build_create(
                            house, city, street_obj)

                        # PHONES, WEBSITE, EMAIL
                        phones = []
                        for ph in data.findAll('span', itemprop='telephone'):
                            ph = ph.text.encode('utf-8').replace(
                                ' ', '').replace('-', '').replace('–', '')
                            phone = REG_PHONE.findall(ph)

                            if phone:
                                phone = phone[0].replace('(',
                                                         '').replace(')', '')
                                phone = phone.decode('utf-8')
                                phone_obj = phones_objs.get(phone)
                                if not phone_obj:
                                    phone_obj = OrganizationPhones.objects.create(
                                        phone=phone)
                                    phones_objs[phone] = phone_obj
                                phones.append(phone_obj)

                        site = None
                        site_block = data.find('li', id='list_sites')
                        if site_block:
                            site = site_block.find('a', itemprop='url')
                            if site:
                                site = site.text.encode('utf-8')

                        email = None
                        email = data.find('span', itemprop='email')
                        if email:
                            email = email.text.encode('utf-8')

                        # LABELS (CATEGORIES, TAGS)
                        tags = []
                        for cat in i.findAll('tag'):
                            category_name = cat['name']
                            category_obj = tags_objs.get(category_name)
                            if not category_obj:
                                category_obj = OrganizationTags.objects.create(
                                    name=category_name)
                                tags_objs[category_name] = category_obj
                            tags.append(category_obj)

                        org_obj = Organization.objects.create(
                            name=title,
                            site=site,
                            email=email,
                            note=None,
                            source_obj=source,
                            source_id=source_id,
                        )

                        org_obj.uni_slug = '%s-%s' % (uni, org_obj.id)
                        org_obj.save()

                        for j in phones:
                            org_obj.phones.add(j)

                        for j in tags:
                            org_obj.tags.add(j)

                        org_obj.buildings.add(building_obj)

                        source_ids.append(source_id)

        if count % 10 == 0:
            time.sleep(random.uniform(1.0, 3.0))

    return HttpResponse(str())
Example #18
def get_0654_organizations():

    source = ImportSources.objects.get(url='http://m.0654.com.ua/')

    org_phones = OrganizationPhones.objects.filter(
        organization__source_obj=source)
    phones_objs = {}
    for i in org_phones:
        phones_objs[i.phone] = i

    org_tags = OrganizationTags.objects.filter(organization__source_obj=source)
    tags_objs = {}
    for i in org_tags:
        tags_objs[i.name] = i

    org_streets = Street.objects.all()
    org_streets_dict = {}
    for i in org_streets:
        org_streets_dict[i.slug.encode('utf-8')] = i

    source_ids = list(
        Organization.objects.filter(source_obj=source).values_list('source_id',
                                                                   flat=True))

    city_name = 'Ялта'

    city = City.objects.get(name__name=city_name, name__status=1)

    with open('%s/organizations.xml' % settings.API_DUMP_PATH, 'r') as f:
        data = BeautifulSoup(f.read(), from_encoding="utf-8")

    temp = {}

    count = 0
    for i in data.findAll('url'):
        count += 1

        url = i['value'].encode('utf-8')
        source_id = url.replace('http://m.0654.com.ua/catalog/full/',
                                '').decode('utf-8')

        if source_id not in source_ids:
            req = urllib.urlopen(url)
            if req.getcode() == 200:
                data = BeautifulSoup(req.read(), from_encoding="utf-8")

                phones_temp = []
                phones = []
                streets = []
                tags = []
                email = None
                site = None

                # NAME
                title = data.find('h2').text.encode('utf-8').split(
                    '	')[0].strip()

                # DESCRIPTION
                description = str(data.find('div',
                                            {'class': 'discription'})).replace(
                                                '<div class="discription">',
                                                '').replace('</div>',
                                                            '').strip()

                if not description:
                    description = None

                # ADDRESSES
                address = data.find('a', {'class': 'addr'})
                if address:
                    # a single entry may list several addresses
                    address_temp = address.string.encode('utf-8').split(';')
                    for addr in address_temp:

                        street_name, street_type, house = get_org_street(addr)

                        if street_type:
                            if street_name:
                                street_slug = low(del_separator(street_name))
                                street_obj = org_streets_dict.get(street_slug)
                                if not street_obj:
                                    street_obj = Street.objects.create(
                                        name=street_name,
                                        slug=street_slug,
                                        type=street_type)
                                    org_streets_dict[street_slug] = street_obj
                            else:
                                street_obj = None
                                house = None

                            building_obj = org_build_create(
                                house, city, street_obj)
                            streets.append(building_obj)
                else:
                    building_obj = org_build_create(None, city, None)
                    streets.append(building_obj)
                    street_type = True

                if street_type:
                    # PHONES, WEBSITE, EMAIL
                    table = data.find('table', {'class': 'har'})

                    for tr in table.findAll('tr'):
                        td = tr.findAll('td', limit=2)
                        if td[0].string == u'Телефон':
                            phones_temp = td[1].div.string.encode('utf-8')
                            phones_temp = phones_temp.replace(' ', '').replace(
                                '-', '').replace('–', '').split(';')
                        elif td[0].string == u'Email адрес':
                            email = td[1].div.a.string.encode('utf-8')
                        elif td[0].string == u'Сайт':
                            site = td[1].div.a.string.encode('utf-8')

                    for phone in phones_temp:
                        phone = REG_PHONE.findall(phone)
                        if phone:
                            phone = phone[0].decode('utf-8')
                            phone_obj = phones_objs.get(phone)
                            if not phone_obj:
                                phone_obj = OrganizationPhones.objects.create(
                                    phone=phone)
                                phones_objs[phone] = phone_obj
                            phones.append(phone_obj)

                    # LABELS (CATEGORIES, TAGS)
                    for cat in i.findAll('cat'):
                        category_name = cat['value']
                        category_obj = tags_objs.get(category_name)
                        if not category_obj:
                            category_obj = OrganizationTags.objects.create(
                                name=category_name)
                            tags_objs[category_name] = category_obj
                        tags.append(category_obj)

                    org_obj = Organization.objects.create(
                        name=title,
                        site=site,
                        email=email,
                        note=description,
                        source_obj=source,
                        source_id=source_id,
                    )

                    for j in phones:
                        org_obj.phones.add(j)

                    for j in tags:
                        org_obj.tags.add(j)

                    for j in streets:
                        org_obj.buildings.add(j)

                    source_ids.append(source_id)

                    if count % 10 == 0:
                        time.sleep(random.uniform(1.0, 3.0))

    return HttpResponse(str('finish'))
Example #19
def get_yovideo():
    source = ImportSources.objects.get(url='http://www.yo-video.net/')
    sfilm_clean(source)
    
    today = datetime.datetime.now()

    french_month = {
        '1': 'janvier',
        '2': 'fevrier',
        '3': 'mars',
        '4': 'avril',
        '5': 'mai',
        '6': 'juin',
        '7': 'juillet',
        '8': 'aout',
        '9': 'septembre',
        '10': 'octobre',
        '11': 'novembre',
        '12': 'decembre',
    }
    
    data_nof_film = ''
    noffilms = []
    
    ignored = get_ignored_films()

    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    
    main_urls = []
    for i in range(today.month, 13):
        m = french_month.get(str(i))
        url = '%sfr/sorties/cinema/%s/%s/' % (source.url, today.year, m)

        req = urllib.urlopen(url)
        if req.getcode() == 200:
            data = BeautifulSoup(req.read(), from_encoding="utf-8")
            
            for h2 in data.findAll('h2'):
                day = h2.findAll('span', limit=1)[0].string.encode('utf-8')
                
                time.sleep(1)
                
                req2 = urllib.urlopen('%s%s' % (url, day))
                if req2.getcode() == 200:
                    data2 = BeautifulSoup(req2.read(), from_encoding="utf-8")
                    
                    release_date = datetime.date(today.year, int(i), int(day))
                    
                    for film_block in data2.findAll('div', {'class': 'sfilm'}):

                        film_id = film_block.find('a').get('href').encode('utf-8')
                        full_url = '%s%s' % (source.url, film_id.lstrip('/'))
                        
                        name = film_block.find('img').get('alt').encode('utf-8').replace('Film ', '')
                        slug = low(del_separator(name))
                        
                        if slug.decode('utf-8') not in ignored and film_id not in noffilms:
                        
                            obj = films.get(film_id)
                            next_step = checking_obj(obj)
                    
                            if next_step:
                                kid = None
                                if obj:
                                    kid = obj.kid
                                
                                if not kid:
                                    req3 = urllib.urlopen(full_url)
                                    
                                    if req3.getcode() == 200:
                                        data3 = BeautifulSoup(req3.read(), from_encoding="utf-8")

                                        h3 = data3.find('h3')
                                        
                                        alter_name = None
                                        alter_name_slug = None
                                            
                                        if h3:
                                            alter_name = h3.string.encode('utf-8')
                                            alter_name_slug = low(del_separator(alter_name))
                                        
                                        
                                        kid, info = film_identification(slug, alter_name_slug, {}, {}, source=source)

                                        txt = None
                                        if not kid:
                                            div = data3.find('div', {'class': "filmLeft"})
                                            img_url = div.find('img').get('src').encode('utf-8')
                        
                                            details = data3.find('div', {'class': "details"})
                                            director = details.find('span', itemprop="name")
                                            if director:
                                                director = director.string.encode('utf-8').strip()
                                            
                                            year = re.findall(ur'Année\s?\: \d+', details.text)
                                            if year:
                                                year = year[0].encode('utf-8').replace('Année','').replace(':','').strip()
                                            
                                            txt = '%s;%s;%s;%s' % (full_url.encode('utf-8'), img_url, director, year)
                                            kid = None
                                            
                                        
                                        objt = None
                                        
                                        if kid:
                                            create_new, objt = unique_func(fdict, kid, obj)
                                            if create_new:
                                                new = create_sfilm(film_id, kid, source, name, name_alt=alter_name, txt=txt, extra=release_date)
                                                films[film_id] = new
                                                if not fdict.get(kid):
                                                    fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                                fdict[kid]['script_rel'].append(new)
                                        else:
                                            if not obj:
                                                new = create_sfilm(film_id, kid, source, name, name_alt=alter_name, txt=txt, extra=release_date)
                                                films[film_id] = new
                                                if not fdict.get(kid):
                                                    fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                                fdict[kid]['script_rel'].append(new)

    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'releases', 'Франц.релизы')
Example #20
def get_kinohod_films():
    #    print "BEGIN get_kinohod_films()"

    ignored = get_ignored_films()

    t1 = time.time()
    start_time = datetime.datetime.now().strftime('%H:%M:%S')

    cron_data_new = ''
    cron_data_nof = ''
    cron_count = 0
    noffilms = []

    source = ImportSources.objects.get(url='http://kinohod.ru/')

    sfilm_clean(source)

    kinohod_cities = get_source_data(source, 'city', 'list')

    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    data_nof_films = ''
    main_url = 'http://www.kinohod.ru/api/rest/partner/v1/movies?apikey=%s' % SERVER_API_KEY
    for city_id in kinohod_cities:
        try:
            url = '%s&city=%s' % (main_url, city_id)
            req = urllib.urlopen(url)

            if req.getcode() == 200:
                json_data = req.read()
                data = json.loads(json_data)
                for i in data:
                    cron_count += 1
                    film_id = str(i['id']).decode('utf-8')
                    year = int(
                        i['productionYear']) if i['productionYear'] else None
                    name_ru = i['title'].encode('utf-8')
                    name_ru_slug = low(del_separator(del_screen_type(name_ru)))
                    full_url = '%smovie/%s/' % (source.url, film_id)
                    name_en = None
                    name_en_slug = None
                    if i['originalTitle']:
                        name_en = i['originalTitle'].encode('utf-8')
                        name_en_slug = low(
                            del_separator(del_screen_type(name_en)))

                    if year and name_ru_slug.decode(
                            'utf-8'
                    ) not in ignored and film_id not in noffilms:

                        obj = films.get(film_id)
                        next_step = checking_obj(obj)

                        if next_step:
                            try:
                                if obj:
                                    kid = obj.kid
                                else:
                                    kid, info = film_identification(
                                        name_ru_slug,
                                        name_en_slug, {}, {},
                                        year=year,
                                        source=source)

                                objt = None
                                if kid:
                                    create_new, objt = unique_func(
                                        fdict, kid, obj)
                                    if create_new:
                                        new = create_sfilm(film_id,
                                                           kid,
                                                           source,
                                                           name_ru,
                                                           name_alt=name_en,
                                                           year=year)
                                        films[film_id] = new
                                        if not fdict.get(kid):
                                            fdict[kid] = {
                                                'editor_rel': [],
                                                'script_rel': []
                                            }
                                        fdict[kid]['script_rel'].append(new)
                                        cron_data_new += '%s<br />' % name_ru
                                elif not obj:
                                    if not name_en:
                                        name_en = '*'
                                        name_en_slug = '*'
                                    data_nof_films += xml_noffilm(
                                        name_ru, name_ru_slug,
                                        name_en, name_en_slug,
                                        film_id.encode('utf-8'), info,
                                        full_url.encode('utf-8'), source.id)
                                    noffilms.append(film_id)
                                    cron_data_nof += '%s<br />' % name_ru
                            except db.backend.Database._mysql.OperationalError:
                                pass

        except IOError:
            open('%s/ddd.txt' % settings.API_DUMP_PATH,
                 'a').write(str(url) + '\n')
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_films)

    # cron log
    end_time = datetime.datetime.now().strftime('%H:%M:%S')
    cron_data = '%s | %s - %s %s\n' % (datetime.datetime.now().date(),
                                       start_time, end_time,
                                       'Импорт фильмов киноход')
    cron_data += '<br /><b>Обработано</b>: %s' % cron_count
    cron_data += '<br /><b>Новые</b>: <br />%s' % cron_data_new
    cron_data += '<br /><b>Ненайденные</b>: <br />%s' % cron_data_nof
    cron_data += '- ' * 50
    process_time = time.time() - t1
    cron_data = '<br />* %s сек.<br />%s' % (process_time, cron_data)
    open('%s/cron_log_kinohod_films.txt' % settings.CRON_LOG_PATH,
         'a').write(cron_data)
    cron_success('json', source.dump, 'films', 'Фильмы')
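
get_kinohod_films deduplicates through two caches: films (keyed by the source's film id) and fdict (keyed by the catalog kid, holding editor- and script-created relations). unique_func decides whether a new SourceFilms row is needed. Its real definition lives elsewhere; the sketch below is a hypothetical reconstruction of the contract implied by the call sites, not the project's actual logic.

# Hypothetical reconstruction of unique_func's contract; names mirror the
# call sites above, but the actual implementation may differ.
def unique_func(fdict, kid, obj):
    # Returns (create_new, existing_obj): reuse an editor- or
    # script-created relation for this kid if one exists,
    # otherwise ask the caller to create a new SourceFilms row.
    rels = fdict.get(kid)
    if rels:
        if rels['editor_rel']:
            return False, rels['editor_rel'][0]
        if rels['script_rel']:
            return False, rels['script_rel'][0]
    if obj is not None:
        return False, obj
    return True, None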
Exemple #21
0
def get_kinohod_cities():
    #    print "BEGIN get_kinohod_cities()"
    t1 = time.time()
    start_time = datetime.datetime.now().strftime('%H:%M:%S')

    cron_data_new = ''
    cron_data_nof = ''
    cron_count = 0

    url = 'http://www.kinohod.ru/api/rest/partner/v1/cities?apikey=%s' % SERVER_API_KEY

    source = ImportSources.objects.get(url='http://kinohod.ru/')

    req = urllib.urlopen(url)
    if req.getcode() == 200:
        kinohod_cities = get_source_data(source, 'city', 'list')

        data_nof_city = ''
        json_data = req.read()
        data = json.loads(json_data)
        for i in data:
            cron_count += 1
            city_id = str(i['id']).decode('utf-8')  # do not shadow the id() builtin

            if city_id not in kinohod_cities:
                alias = i['alias']
                name = i['name'].encode('utf-8')
                name_slug = del_screen_type(low(del_separator(name)))

                city = City.objects.filter(name__name=name_slug,
                                           name__status=2).distinct('pk')
                if city.count() == 1:
                    SourceCities.objects.create(
                        source_id=city_id,
                        source_obj=source,
                        city=city[0],
                        name=name,
                        name_alter=alias,
                    )
                    cron_data_new += '%s<br />' % name
                else:
                    data_nof_city += '<city name="%s" slug="%s"></city>' % (
                        name, name_slug)
                    cron_data_nof += '%s<br />' % name
                kinohod_cities.append(city_id)

        create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH,
                         '<data>%s</data>' % data_nof_city)

    # cron log
    end_time = datetime.datetime.now().strftime('%H:%M:%S')
    cron_data = '%s | %s - %s %s<br />' % (datetime.datetime.now().date(),
                                           start_time, end_time,
                                           'Импорт городов киноход')
    cron_data += '<br /><b>Обработано</b>: %s' % cron_count
    cron_data += '<br /><b>Новые</b>: <br />%s' % cron_data_new
    cron_data += '<br /><b>Ненайденные</b>: <br />%s' % cron_data_nof
    cron_data += '- ' * 50
    process_time = time.time() - t1
    cron_data = '<br />* %s сек.<br />%s' % (process_time, cron_data)
    open('%s/cron_log_kinohod_cities.txt' % settings.CRON_LOG_PATH,
         'a').write(cron_data)
    cron_success('json', source.dump, 'cities', 'Города')
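
City matching in get_kinohod_cities relies on slug normalization: names are lowercased and stripped of separators before being compared against city names with status=2. low, del_separator and del_screen_type are project helpers whose exact behavior is not shown here; the regex-based version below is an assumption that approximates them.

# Approximate slug normalization (the real low/del_separator helpers are
# project code; this regex-based stand-in is an assumption).
import re

def make_slug(name):
    # lowercase and strip spaces/punctuation so u'Нижний Новгород'
    # and u'нижнийновгород' compare equal; expects a unicode string,
    # since the project's low() presumably handles Cyrillic case folding
    return re.sub(r'[\s\-_.,:;!?"\'()]+', '', name.lower(), flags=re.UNICODE)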
Exemple #22
0
def search(request):

    query_orig = request.GET.get('query', '')
    category = request.GET.get('filter', '')
    objs = []
    msg = ''
    element = ''
    data = {}
    count = 0
    
    if query_orig:
        
        query = low(query_orig)
        
        slug = del_separator(query)
        if len(query) > 2:
            
            # films
            if category == '1':
                data = []
                tmp_query = query.decode('utf-8').encode('ascii', 'xmlcharrefreplace')
                element = 'фильмов'
                try:

                    '''
                    Fetch film IDs matching the search query.
                    upd: a check via set intersection proved identical to the
                    previous approach (concatenating two sets).
                    upd: removed slug search, because the slug is built by gluing
                    together all words of the title, which leads to irrelevant
                    search results (see the film "San tau jin zi lei saam")
                    '''

                    ids = set(
                        FilmsName.objects.using('afisha').filter(
                            Q(name__icontains=query),
                            status=1
                        ).values_list('film_id', flat=True))


                    objs = FilmsName.objects.using('afisha') \
                        .only(
                            'name',
                            'film_id',
                            'film_id__description',
                            'film_id__year',
                            'film_id__genre1',
                            'film_id__genre2',
                            'film_id__imdb',
                        ) \
                        .select_related('film_id') \
                        .filter(film_id__id__in=ids, status=1, type=2) \
                        .order_by('-film_id__year', 'name')



                    tmp = None
                    tmp_first = []
                    relevant = []
                    nonrelevant = []

                    for i in objs:

                        '''
                        Fetch the poster
                        '''

                        temp_poster = Objxres.objects.using('afisha').select_related('extresid').filter(
                            objtypeid__in=[301, 300],
                            objpkvalue=i.film_id_id)

                        posters = {}
                        for p in temp_poster:
                            entry = posters.setdefault(p.objpkvalue, {'poster': [], 'slides': []})
                            if p.objtypeid == 301:
                                entry['poster'].append(p)
                            else:
                                entry['slides'].append(p)

                        if posters and posters[i.film_id_id]['poster']:
                            i.poster = film_poster2(posters[i.film_id_id]['poster'], 'big')

                        '''
                        Fetch the metadata
                        '''
                        meta = Film.objects.using('afisha').get(id=i.film_id_id)

                        '''
                        From the metadata, pull the genre names
                        '''
                        i.genres = AfishaGenre.objects.using('afisha').filter(
                            pk__in=[meta.genre1_id, meta.genre2_id, meta.genre3_id])

                        '''
                        From the metadata, pull the IMDB rating.
                        Dirty hack: the Django template engine cannot handle
                        numeric loops with conditions
                        '''
                        if meta.imdb:
                            i.imdb = {}
                            imdb_val = float(meta.imdb.replace(',', '.'))
                            i.imdb['range'] = range(int(math.ceil(imdb_val)))
                            i.imdb['value'] = meta.imdb

                        '''
                        Fetch the weighted rating from the system
                        '''
                        int_rate, show_ir, show_imdb, rotten = check_int_rates(i.film_id_id)
                        i.rating = {'rate': int_rate, 'show_ir': show_ir, 'show_imdb': show_imdb, 'rotten': rotten}

                        '''
                        upd: reworked relevance sorting; the signal is whether
                        the search phrase occurs in the description
                        '''
                        tmp = i.film_id_id

                        if tmp not in tmp_first:

                            # str.find() returns -1 (which is truthy) when the
                            # phrase is absent, so test membership with `in`
                            if meta.description is not None and tmp_query in meta.description:
                                relevant.append(i)
                            else:
                                nonrelevant.append(i)

                            tmp_first.append(tmp)

                    data = relevant + nonrelevant

                    count = len(tmp_first)

                    if count == 1:
                        return HttpResponseRedirect(reverse('get_film', kwargs={'id': tmp}))

                except db.backend.Database._mysql.OperationalError:
                    pass
            # persons
            elif category == '2':
                data = {'first': [], 'middle': [], 'last': []}
                
                element = 'персон'
                
                tmp_query = query.decode('utf-8').encode('ascii', 'xmlcharrefreplace')
                
                ids1 = set(list(NamePerson.objects.exclude(person__kid=None).filter(Q(name__iexact=tmp_query) | Q(name__istartswith=slug), status__in=(1,2)).values_list('person__id', flat=True)))
                
                ids2 = set(list(NamePerson.objects.exclude(person__id__in=ids1, person__kid=None).filter(name__icontains=slug, status=2).values_list('person__id', flat=True)))
                
                ids = set(list(ids1) + list(ids2))
                
                objs1 = list(NamePerson.objects.filter(person__id__in=ids, status=1, language__id=1).order_by('name').values('person__id', 'name', 'person__kid'))
                
                result_ids = set([i['person__id'] for i in objs1])
                
                objs2 = list(NamePerson.objects.exclude(person__id__in=result_ids).filter(person__id__in=ids, status=1, language__id=2).order_by('name').values('person__id', 'name', 'person__kid'))
                
                
                tmp = None
                    
                tmp_first = []
                
                for objs in [objs1, objs2]:
                    for i in objs:
                        tmp = i['person__kid']
                        if long(i['person__id']) in ids1:
                            if low(i['name'].encode('utf-8')) == tmp_query:
                                if tmp not in tmp_first:
                                    data['first'].append((i['person__kid'], i['name']))
                                    tmp_first.append(tmp)
                            else:
                                if tmp not in tmp_first:
                                    data['middle'].append((i['person__kid'], i['name']))
                                    tmp_first.append(tmp)
                        else:
                            if tmp not in tmp_first:
                                data['last'].append((i['person__kid'], i['name']))
                                tmp_first.append(tmp)
         
                count = len(tmp_first)
                
                if count == 1:
                    return HttpResponseRedirect(reverse('get_person', kwargs={'id': tmp}))
            
            elif category == '3':
                # music
                if request.subdomain == 'music':
                    data = {'first': [], 'middle': [], 'last': [], 'artist': []}
                    
                    element = ''
                    
                    tmp_query = query.decode('utf-8').encode('ascii', 'xmlcharrefreplace')
                    
                    ids1 = set(list(Composition.objects.filter(Q(name__name__iexact=tmp_query) | Q(name__name__istartswith=slug), name__status__in=(2,5)).values_list('id', flat=True)))
                    
                    ids2 = set(list(Composition.objects.exclude(pk__in=ids1).filter(name__name__icontains=slug,  name__status__in=(2,5)).values_list('id', flat=True)))
                    
                    ids = set(list(ids1) + list(ids2))
                    
                    comp_rels = {}
                    for i in CompositionPersonRel.objects.filter(composition__pk__in=ids, type__name='исполнение').values('person', 'person__name__name', 'composition'):
                        comp_rels[i['composition']] = {'pid': i['person'], 'pname': i['person__name__name']}
                    
                    objs1 = list(CompositionName.objects.filter(composition__pk__in=ids, status=2).order_by('name').values('composition__id', 'name'))
                    
                    tmp = None
                        
                    tmp_first = []
                    
                    for i in objs1:
                        tmp = i['composition__id']
                        if long(i['composition__id']) in ids1:
                            if low(i['name'].encode('utf-8')) == query:
                                if tmp not in tmp_first:
                                    artist = comp_rels.get(i['composition__id'])
                                    data['first'].append((i['composition__id'], i['name'], artist))
                                    tmp_first.append(tmp)
                            else:
                                if tmp not in tmp_first:
                                    artist = comp_rels.get(i['composition__id'])
                                    data['middle'].append((i['composition__id'], i['name'], artist))
                                    tmp_first.append(tmp)
                        else:
                            if tmp not in tmp_first:
                                artist = comp_rels.get(i['composition__id'])
                                data['last'].append((i['composition__id'], i['name'], artist))
                                tmp_first.append(tmp)
             
                    count = len(tmp_first)

                    artists = Person.objects.filter(Q(name__name__iexact=tmp_query) | Q(name__name__istartswith=slug) | Q(name__name__icontains=slug), artist=True, name__status=4).values('id', 'name__name')

                    data['artist'] = artists
                    
                    count += artists.count()
                # cinemas
                else:

                    element = 'кинотеатров'
                    
                    tmp_query = query.decode('utf-8').encode('ascii', 'xmlcharrefreplace')
                    
                    ids1 = set(list(Organization.objects.exclude(kid=None).filter(Q(name__iexact=tmp_query) | Q(name__istartswith=slug)).values_list('id', flat=True)))
                    ids2 = set(list(Organization.objects.exclude(Q(id__in=ids1) | Q(kid=None)).filter(name__icontains=slug).values_list('id', flat=True)))
                    
                    ids = set(list(ids1) + list(ids2))

                    buildings = list(Building.objects.filter(organization__id__in=ids, city__name__status=1).values('city', 'city__name__name', 'city__country', 'city__country__name', 'city__kid', 'organization', 'organization__name', 'organization__uni_slug'))

                    
                    cities = {}
                    count = 0
                    tmp = None
                    for i in buildings:
                        count += 1
                        tmp = i['organization__uni_slug']
                        if not cities.get(i['city']):
                            cities[i['city']] = {
                                'id': i['city'], 
                                'name': i['city__name__name'], 
                                'country': i['city__country__name'], 
                                'cinemas': []
                            }

                        cities[i['city']]['cinemas'].append({'name': i['organization__name'], 'slug': i['organization__uni_slug']})

                    cities = sorted(cities.values(), key=operator.itemgetter('name'))


                    data = {}
                    for i in cities:
                        if not data.get(i['country']):
                            data[i['country']] = []
                        data[i['country']].append(i)
                    
                    if count == 1:
                        return HttpResponseRedirect(reverse('organization_cinema', kwargs={'id': tmp}))

        else:
            msg = 'Слишком короткий запрос'
    
    return render_to_response('search_result.html', {'objs': data, 'element': element, 'msg': msg, 'srch_category': category, 'query': query_orig, 'count': count},  context_instance=RequestContext(request))
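
One detail of the films branch above deserves a standalone illustration: ranking by "the search phrase occurs in the description" must use membership, not str.find(), because find() returns -1 (a truthy value) when the phrase is absent.

# Why the relevance test uses `in` rather than str.find()
description = 'a film about a heist'
assert 'heist' in description             # present: relevant
assert description.find('nope') == -1     # absent, yet -1 is truthy,
                                          # so `if description.find(...)` misranks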
Exemple #23
0
def parse_data_ident(request, selected):
    """Функция для идентификации полученных записей
    """
    #try:
    debug_logs("start ident %s " % selected)
    # Начинаем отчет времени выполнения фукнции
    start = time.time()

    data_nof_film = ''
    noffilms = []

    ignored = get_ignored_films()

    # Set the identification type, passed as a parameter to the identification function
    ident_type = 'movie_online'

    # Select all films from the database flagged with afisha_id=None;
    # every film gets this flag during parsing: these are films that
    # have never yet been through kinoafisha identification
    data = MovieMegogo.objects.filter(afisha_id__in=(0, None))

    # Pull the parameters needed for identification,
    # iterating in the loop over each individual film
    for i in data:

        year = i.year
        name_ru = i.title
        name_en = i.title_en
        country = i.country

        # Clean up the ru/en titles for film identification
        name_ru_slug = del_separator(low(name_ru))
        name_en_slug = del_separator(low(name_en))

        # Set the year range for film identification
        new_year = year + 2
        old_year = year - 2
        filter_year = {'year__gte': old_year, 'year__lte': new_year}

        try:
            # Pass the film to the identification function
            kid, info = film_identification(name_ru_slug, name_en_slug, {}, {},
                                            filter_year, ident_type, country)

            if kid:
                # Save the result to the model
                i.afisha_id = kid
                i.save()
            else:
                if i.megogo_id not in noffilms and name_ru_slug.decode(
                        'utf-8') not in ignored:
                    data_nof_film += xml_noffilm(name_ru.encode('utf-8'),
                                                 name_ru_slug, None, None,
                                                 i.megogo_id, info,
                                                 i.page.encode('utf-8'))
                    noffilms.append(i.megogo_id)
        except db.backend.Database._mysql.OperationalError:
            if i.megogo_id not in noffilms and name_ru_slug.decode(
                    'utf-8') not in ignored:
                data_nof_film += xml_noffilm(name_ru.encode('utf-8'),
                                             name_ru_slug, None, None,
                                             i.megogo_id, None,
                                             i.page.encode('utf-8'))
                noffilms.append(i.megogo_id)

        # Function execution time
        finish = time.time()
        timer = "%.2f мин" % ((float(finish - start)) / 60)

        debug_logs("finish")
        debug_logs("timer: %s " % timer)
        debug_logs("Идентификация: название %s / инфо %s %s" %
                   (name_ru_slug, kid, info))

    source = ImportSources.objects.get(url='http://megogo.net/')
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)

    # Return to the interface
    return simplejson.dumps({
        'request_type': 1,
        'timer': timer,
    })
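
parse_data_ident widens the source's production year into a plus/minus two year window before identification, which absorbs production-versus-release-year discrepancies between catalogs. The filter keys mirror the Django ORM lookups used above:

# The +/-2 year window passed to film_identification above
year = 2012
filter_year = {'year__gte': year - 2, 'year__lte': year + 2}
assert filter_year == {'year__gte': 2010, 'year__lte': 2014}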
Exemple #24
0
def get_megamag():
    '''
    Fetch film URLs
    '''
    import cookielib

    def give_me_cookie():
        cookie = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie),
                                      urllib2.HTTPHandler())
        return opener

    ignored = get_ignored_films()

    ignored_cinemas = get_ignored_cinemas()

    source = ImportSources.objects.get(url='http://megamag.by/')
    sfilm_clean(source)

    megamag_cities_dict = get_source_data(source, 'city', 'dict')
    megamag_cinemas_dict = get_source_data(source, 'cinema', 'dict')

    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    cities_data = {}

    data_nof_films = ''
    data_nof_cinema = ''
    data_nof_city = ''
    noffilms = []
    schedules_data = []

    opener = give_me_cookie()
    req = opener.open(urllib2.Request('http://kinoteatr.megamag.by/index.php'))
    event_dict = {}
    if req.getcode() == 200:
        data = BeautifulSoup(req.read(), from_encoding="utf-8")

        cities = data.find('div', id="box-region")

        for i in cities.findAll('a'):

            city_name = i.text.encode('utf-8')
            city_slug = low(del_separator(city_name))
            city_id = i.get('href').replace(
                'http://kinoteatr.megamag.by/index.php?region_id=', '')

            mcity = megamag_cities_dict.get(city_id)

            if not mcity:
                city = City.objects.filter(name__name=city_slug,
                                           name__status=2).distinct('pk')
                if city.count() == 1:
                    mcity = SourceCities.objects.create(
                        source_id=city_id,
                        source_obj=source,
                        city=city[0],
                        name=city_name,
                    )
                else:
                    if 'slug="%s"' % city_slug not in data_nof_city:
                        data_nof_city += '<city name="%s" slug="%s"></city>' % (
                            city_name, city_slug)

            if mcity:
                cities_data[city_name] = mcity

        try:
            cinemas_tag = data.findAll('td',
                                       {'class': 'Cinema_new_box_1_BoxText'},
                                       limit=1)[0]
        except IndexError:
            cinemas_tag = None

        if cinemas_tag:
            for i in cinemas_tag.findAll('a'):
                cinema_url = i.get('href')
                cinema_id = cinema_url.replace(
                    'http://kinoteatr.megamag.by/index.php?cPath=', '')
                cinema_obj = megamag_cinemas_dict.get(cinema_id)

                opener = give_me_cookie()
                try:
                    req2 = opener.open(urllib2.Request(cinema_url))

                    if req2.getcode() == 200:
                        schedules_page = BeautifulSoup(req2.read(),
                                                       from_encoding="utf-8")
                        city_name = schedules_page.findAll(
                            'div', {'class': 'object_param_value'},
                            limit=1)[0].text.encode('utf-8')

                        city_obj = cities_data.get(city_name)
                        if city_obj:
                            cinema_name = schedules_page.find(
                                'div', {
                                    'class': 'object_title'
                                }).text.encode('utf-8')
                            cinema_name = cinema_name.replace('"', '').replace(
                                'Кинотеатр', '')
                            cinema_slug = low(del_separator(cinema_name))

                            cinema_ig_id = u'%s__%s' % (
                                cinema_slug.decode('utf-8'), city_obj.city.kid)

                            if cinema_ig_id not in ignored_cinemas:

                                if not cinema_obj:
                                    filter1 = {
                                        'name__name': cinema_slug,
                                        'name__status': 2,
                                        'city': city_obj.city
                                    }
                                    cinema_kid = cinema_identification(
                                        cinema_slug, filter1)
                                    if cinema_kid:
                                        try:
                                            cinema = Cinema.objects.get(
                                                code=cinema_kid)
                                            cinema_obj = SourceCinemas.objects.create(
                                                source_id=cinema_id,
                                                source_obj=source,
                                                city=city_obj,
                                                cinema=cinema,
                                                name=cinema_name,
                                            )
                                        except Cinema.DoesNotExist:
                                            pass
                                else:
                                    cinema_kid = cinema_obj.cinema.code

                                if cinema_kid:
                                    for event in schedules_page.findAll(
                                            'td', {'class': 'eventsHeading'}):
                                        if event.a.get('name'):
                                            ev = event.a['name'].split('_')[1]
                                            fname = event.a.text.encode(
                                                'utf-8')
                                            fid = event.a.get('href').replace(
                                                'http://kinoteatr.megamag.by/newsdesk_info.php?newsdesk_id=',
                                                '')
                                            event_dict[int(ev)] = {
                                                'name': fname,
                                                'id': int(fid)
                                            }

                                    links = []
                                    for td in schedules_page.findAll(
                                            'td', {'class': 'main'}):
                                        for link in td.findAll('a'):
                                            l = link.get('href')
                                            if l and 'cPath' in l:
                                                links.append(l)
                                    schedules_data.append({
                                        'mcity':
                                        city_obj,
                                        'city':
                                        city_obj.city,
                                        'mcinema':
                                        cinema_obj,
                                        'cinema':
                                        cinema_kid,
                                        'schedules':
                                        set(links)
                                    })
                                else:
                                    if 'slug="%s"' % cinema_slug not in data_nof_cinema:
                                        data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (
                                            cinema_name, cinema_slug,
                                            city_name, city_obj.city.kid)
                except httplib.HTTPException:
                    pass
        create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH,
                         '<data>%s</data>' % data_nof_city)
        create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH,
                         '<data>%s</data>' % data_nof_cinema)

        megamag = get_source_data(source, 'schedule', 'list')

        for obj in schedules_data:
            cinema_object = obj['mcinema']

            for index, i in enumerate(obj['schedules']):
                opener = give_me_cookie()
                try:
                    req3 = opener.open(urllib2.Request(i))
                    if req3.getcode() == 200:

                        id_schedule = i.replace(
                            'http://kinoteatr.megamag.by/index.php?cPath=',
                            '').encode('utf-8')
                        if id_schedule not in megamag:
                            sch_page = BeautifulSoup(req3.read(),
                                                     from_encoding="utf-8")

                            tables = sch_page.findAll('table', {
                                'class':
                                'Cinema_new_box_2_TemplateCenterPart'
                            },
                                                      limit=1)[0]
                            main_table = tables.findAll('table',
                                                        cellpadding='4',
                                                        limit=1)[0]
                            tr = main_table.findAll('tr')[1]
                            td = tr.findAll('strong')

                            event_id = id_schedule.split('_')[2]
                            film_data = event_dict.get(int(event_id))
                            if film_data:
                                film_name = film_data['name']
                                film_name_slug = low(
                                    del_separator(del_screen_type(film_name)))
                                film_id = film_data['id']

                                if film_id not in noffilms and film_name_slug.decode(
                                        'utf-8') not in ignored:

                                    obj = films.get(
                                        str(film_id).decode('utf-8'))
                                    next_step = checking_obj(obj)

                                    if next_step:
                                        if obj:
                                            kid = obj.kid
                                        else:
                                            kid, info = film_identification(
                                                film_name_slug,
                                                None, {}, {},
                                                source=source)

                                        objt = None
                                        if kid:
                                            create_new, objt = unique_func(
                                                fdict, kid, obj)
                                            if create_new:
                                                objt = create_sfilm(
                                                    film_id, kid, source,
                                                    film_name)
                                                films[str(film_id).decode(
                                                    'utf-8')] = objt
                                                if not fdict.get(kid):
                                                    fdict[kid] = {
                                                        'editor_rel': [],
                                                        'script_rel': []
                                                    }
                                                fdict[kid][
                                                    'script_rel'].append(objt)
                                        elif not obj:
                                            data_nof_films += xml_noffilm(
                                                film_name, film_name_slug,
                                                None, None, film_id, info,
                                                None, source.id)
                                            noffilms.append(film_id)

                                        if objt:
                                            dtime_info = td[1].text.encode(
                                                'utf-8').split()
                                            year_info = datetime.datetime.now(
                                            ).year
                                            day_info = int(dtime_info[0])
                                            month_low = low(
                                                dtime_info[1].replace(',', ''))
                                            month_info = int(
                                                get_month(month_low))
                                            time_info = dtime_info[-1].replace(
                                                '(', '').replace(')',
                                                                 '').split(':')

                                            dtime = datetime.datetime(
                                                year_info, month_info,
                                                day_info, int(time_info[0]),
                                                int(time_info[1]), 0)
                                            SourceSchedules.objects.create(
                                                source_id=id_schedule,
                                                source_obj=source,
                                                cinema=cinema_object,
                                                film=objt,
                                                dtime=dtime,
                                            )
                except httplib.HTTPException:
                    open('%s/httplib_errors.txt' % settings.API_DUMP_PATH,
                         'a').write('%s\n' % i)
                # pause for 2 seconds after every 60th request to the source
                if (index + 1) % 60 == 0:
                    time.sleep(2.0)

    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_films)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
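
The megamag schedule text above is parsed by hand: a day number, a Russian month name, and a parenthesised time are split out, with the month name mapped through get_month. A hedged sketch of that parsing, with the month table written out by hand (the project's real get_month helper may differ):

# Hedged sketch of parsing a schedule string such as u'15 января (19:30)';
# the MONTHS mapping stands in for the project's get_month helper.
import datetime

MONTHS = {u'января': 1, u'февраля': 2, u'марта': 3, u'апреля': 4,
          u'мая': 5, u'июня': 6, u'июля': 7, u'августа': 8,
          u'сентября': 9, u'октября': 10, u'ноября': 11, u'декабря': 12}

def parse_showtime(text, year):
    parts = text.split()
    day = int(parts[0])
    month = MONTHS[parts[1].replace(u',', u'').lower()]
    hours, minutes = parts[-1].strip(u'()').split(u':')
    return datetime.datetime(year, month, day, int(hours), int(minutes))

assert parse_showtime(u'15 января (19:30)', 2014) == datetime.datetime(2014, 1, 15, 19, 30)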
Exemple #25
0
def get_premierzal_schedules():
    data_nof_film = ''
    noffilms = []

    ignored = get_ignored_films()

    source = ImportSources.objects.get(url='http://www.premierzal.ru/')
    sfilm_clean(source)

    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    schedules = get_source_data(source, 'schedule', 'list')

    cities_cinemas = {}
    for i in SourceCinemas.objects.select_related('city').filter(
            source_obj=source):
        if not cities_cinemas.get(i.city.source_id):
            cities_cinemas[i.city.source_id] = {'city': i.city, 'cinemas': []}
        cities_cinemas[i.city.source_id]['cinemas'].append(i)

    for k, v in cities_cinemas.iteritems():
        city_url_encode = urllib.quote(v['city'].name.encode('utf-8'))
        for i in v['cinemas']:
            main_url = '%s?theatre=%s&city=%s' % (source.url, i.source_id,
                                                  city_url_encode)

            main_req = urllib.urlopen(main_url)
            if main_req.getcode() == 200:
                data = BeautifulSoup(main_req.read())
                data = data.find('div', id="films-list")

                if data:
                    dates = []
                    for calendar in data.findAll('table',
                                                 {'class': 'calendar'}):
                        for a in calendar.findAll('a'):
                            href = a.get('href', '')
                            href_dict = dict(cgi.parse_qsl(href))
                            calendar_date = href_dict.get(
                                u'?date', href_dict.get(u'date'))
                            if calendar_date:
                                dates.append({
                                    'date': calendar_date,
                                    'href': href
                                })

                    for ind, d in enumerate(dates):
                        films_blocks = []
                        if ind == 0:
                            films_blocks = data.findAll(
                                'div', {'class': 'film-item-wrapper'})
                        else:
                            url = '%s?date=%s&city=%s&theatre=%s' % (
                                source.url, d['date'], city_url_encode,
                                i.source_id)

                            req = urllib.urlopen(url)
                            if req.getcode() == 200:
                                data = BeautifulSoup(req.read())
                                data = data.find('div', id="films-list")
                                films_blocks = data.findAll(
                                    'div', {'class': 'film-item-wrapper'})
                            time.sleep(random.uniform(0.8, 2.2))

                        for block in films_blocks:
                            title = block.find('div', {
                                'class': 'title'
                            }).find('a')

                            film_name = title.text.encode('utf-8').strip()
                            film_slug = low(
                                del_separator(del_screen_type(film_name)))
                            film_id = film_slug

                            if film_id not in noffilms and film_slug.decode(
                                    'utf-8') not in ignored:

                                obj = films.get(film_id.decode('utf-8'))
                                next_step = checking_obj(obj)

                                if next_step:
                                    if obj:
                                        kid = obj.kid
                                    else:
                                        kid, info = film_identification(
                                            film_slug,
                                            None, {}, {},
                                            source=source)

                                    objt = None
                                    if kid:
                                        create_new, objt = unique_func(
                                            fdict, kid, obj)
                                        if create_new:
                                            objt = create_sfilm(
                                                film_id, kid, source,
                                                film_name)
                                            films[film_id.decode(
                                                'utf-8')] = objt
                                            if not fdict.get(kid):
                                                fdict[kid] = {
                                                    'editor_rel': [],
                                                    'script_rel': []
                                                }
                                            fdict[kid]['script_rel'].append(
                                                objt)
                                    elif not obj:
                                        data_nof_film += xml_noffilm(
                                            film_name, film_slug, None, None,
                                            film_id, info, None, source.id)
                                        noffilms.append(film_id)

                                    if objt:
                                        year, month, day = d['date'].split(
                                            u'-')

                                        for tm in block.findAll(
                                                'div',
                                            {'class': 'seanse-item'}):
                                            for t in tm.text.encode(
                                                    'utf-8').split('|'):
                                                t = re.findall(
                                                    r'\d{2}\:\d{2}', t)
                                                if t:
                                                    hours, minutes = t[
                                                        0].strip().split(':')
                                                    dtime = datetime.datetime(
                                                        int(year), int(month),
                                                        int(day), int(hours),
                                                        int(minutes))

                                                    sch_id = '%s%s%s' % (
                                                        dtime,
                                                        i.source_id.encode(
                                                            'utf-8'), film_id)
                                                    sch_id = sch_id.replace(
                                                        ' ',
                                                        '').decode('utf-8')

                                                    if sch_id not in schedules:
                                                        SourceSchedules.objects.create(
                                                            source_id=sch_id,
                                                            source_obj=source,
                                                            film=objt,
                                                            cinema=i,
                                                            dtime=dtime,
                                                        )
                                                        schedules.append(
                                                            sch_id)
            time.sleep(random.uniform(1.1, 1.8))

    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
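
get_premierzal_schedules deduplicates showings with a synthetic id: the datetime, the cinema's source id and the film id are concatenated and spaces removed. The construction in isolation:

# The synthetic schedule id used for deduplication above
import datetime

dtime = datetime.datetime(2014, 3, 8, 19, 30)
sch_id = ('%s%s%s' % (dtime, 'theatre42', 'filmslug')).replace(' ', '')
assert sch_id == '2014-03-0819:30:00theatre42filmslug'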
Exemple #26
0
def get_vkinocomua_films_and_schedules():
    ignored = get_ignored_films()

    data_nof_film = ''
    noffilms = []

    source = ImportSources.objects.get(url='http://vkino.com.ua/')
    sfilm_clean(source)
    
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    
    schedules = get_source_data(source, 'schedule', 'list')

    cinemas_data = SourceCinemas.objects.select_related('city').filter(source_obj=source)
    cinemas = {}
    for ind, i in enumerate(cinemas_data):
        url = '%scinema/%s/%s/showtimes' % (source.url, i.city.source_id, i.source_id)
        req = urllib.urlopen(url)
        if req.getcode() == 200:
            data = BeautifulSoup(req.read(), from_encoding="utf-8")
            main = data.find('div', id='cinema-showtimes')
            if main:
                for content in main.findAll('div', {'class': 'content'}):
                    film_tag = content.find('a', {'class': 'navi'})
                    film_name = film_tag.string.encode('utf-8').strip()
                    film_slug = low(del_separator(film_name))
                    
                    full_url = film_tag.get('href').encode('utf-8')
                    film_id = re.findall(r'\/\d+\/', full_url)
                    if film_id:
                        film_id = film_id[0].replace('/','').encode('utf-8')
                    else:
                        film_id = film_slug
                    
                    full_url = '%s%s' % (source.url, full_url.lstrip('/'))
                    

                    if film_id not in noffilms and film_slug.decode('utf-8') not in ignored:

                        obj = films.get(film_id.decode('utf-8'))
                        next_step = checking_obj(obj)
                    
                        if next_step:
                            if obj:
                                kid = obj.kid
                            else:
                                kid, info = film_identification(film_slug, None, {}, {}, source=source)
                        
                            objt = None
                            if kid:
                                create_new, objt = unique_func(fdict, kid, obj)
                                if create_new:
                                    objt = create_sfilm(film_id, kid, source, film_name)
                                    films[film_id] = objt
                                    if not fdict.get(kid):
                                        fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                    fdict[kid]['script_rel'].append(objt)
                            elif not obj:
                                data_nof_film += xml_noffilm(film_name, film_slug, None, None, film_id, info, url.encode('utf-8'), source.id)
                                noffilms.append(film_id)
                        

                            if objt:
                                for div in content.findAll('div', {'class': 'date'}):
                                    year, month, day = div['data-date'].split('-')
                                    show = div.find_next_sibling("ul")
                                    for li in show.findAll('li'):
                                        if li.a:
                                            extra = li.a.get('href')
                                            hours, minutes = li.a.text.strip().split(':')
                                        else:
                                            extra = None
                                            hours, minutes = li.text.strip().split(':')

                                        dtime = datetime.datetime(int(year), int(month), int(day), int(hours), int(minutes))

                                        sch_id = u'%s%s%s%s' % (dtime, i.source_id, i.city_id, film_id.decode('utf-8'))
                                        sch_id = sch_id.replace(' ', '')

                                        if sch_id not in schedules:
                                            SourceSchedules.objects.create(
                                                source_id = sch_id,
                                                source_obj = source,
                                                film = objt,
                                                cinema = i,
                                                dtime = dtime,
                                                extra = extra,
                                            )
                                            schedules.append(sch_id)
        if ind % 4 == 0:
            time.sleep(random.uniform(1.0, 3.0))        
        
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
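
Like the other crawlers here, get_vkinocomua_films_and_schedules throttles itself (a random sleep roughly every fourth cinema). A generic version of that pattern, with the interval and bounds as parameters:

# Generic throttling helper in the spirit of the `ind % 4` sleeps above
import random
import time

def polite_pause(index, every=4, low_s=1.0, high_s=3.0):
    # sleep after every `every`-th request so the source is not hammered
    if index % every == 0:
        time.sleep(random.uniform(low_s, high_s))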
Exemple #27
0
def get_premierzal_cinemas():
    source = ImportSources.objects.get(url='http://www.premierzal.ru/')

    cinemas = get_source_data(source, 'cinema', 'list')

    cities_dict = get_source_data(source, 'city', 'dict')

    cinemas_dict = {}
    for i in Cinema.objects.all():
        cinemas_dict[i.code] = i

    ignored_cinemas = get_ignored_cinemas()

    data_nof_cinema = ''

    city = cities_dict.values()[0]

    body = urllib.urlencode({
        'city': city.name.encode('utf-8'),
    })

    url = '%stheatres?%s' % (source.url, body)

    req = urllib.urlopen(url)
    if req.getcode() == 200:
        data = BeautifulSoup(req.read())

        blocks = []

        block1 = data.find('div', {'class': 'this_city_theatres'})
        block2 = data.find('div', {'class': 'other_city_theatres'})

        if block1:
            blocks.append(block1)

        if block2:
            blocks.append(block2)

        for ind, block in enumerate(blocks):
            for a in block.findAll('a'):
                cinema_name = a.text.encode('utf-8').strip().replace('"', '')
                cinema_id = a.get('href').replace('/theatres/',
                                                  '').replace('/', '')

                if ind == 0:
                    city_obj = city
                else:
                    city_name, cinema_name = cinema_name.split(',')
                    cinema_name = cinema_name.strip()
                    city_slug = low(del_separator(city_name.strip()))
                    city_obj = cities_dict.get(city_slug.decode('utf-8'))

                cinema_slug = low(del_separator(cinema_name))

                if city_obj:
                    cinema_ig_id = u'%s__%s' % (cinema_slug.decode('utf-8'),
                                                city_obj.city.kid)

                    if cinema_id.decode(
                            'utf-8'
                    ) not in cinemas and cinema_ig_id not in ignored_cinemas:

                        filter1 = {
                            'name__name': cinema_slug,
                            'name__status': 2,
                            'city__id': city_obj.city_id
                        }

                        cinema = cinema_identification(cinema_slug, filter1)

                        cin_obj = cinemas_dict.get(cinema)
                        if cin_obj:
                            SourceCinemas.objects.create(
                                source_id=cinema_id,
                                source_obj=source,
                                city=city_obj,
                                cinema=cin_obj,
                                name=cinema_name,
                            )
                            cinemas.append(cinema_id.decode('utf-8'))
                        else:
                            data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (
                                cinema_name, cinema_slug,
                                city_obj.name.encode('utf-8'),
                                city_obj.city.kid)

    create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_cinema)
    cron_success('html', source.dump, 'cinemas', 'Кинотеатры')
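
get_premierzal_cinemas builds its query string with urllib.urlencode over a UTF-8 encoded city name: Cyrillic values must be percent-encoded as UTF-8 bytes before they can appear in a URL.

# Percent-encoding a Cyrillic query parameter, as done above (Python 2)
import urllib

body = urllib.urlencode({'city': u'Москва'.encode('utf-8')})
url = 'http://www.premierzal.ru/theatres?%s' % body
assert 'city=%D0%9C%D0%BE%D1%81%D0%BA%D0%B2%D0%B0' in url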
Exemple #28
0
def get_vkinocomua_cities_and_cinemas():
    nofcities = []
    nofcinemas = []
    data_nof_cinema = ''
    data_nof_city = ''
    
    source = ImportSources.objects.get(url='http://vkino.com.ua/')

    cinemas_dict = get_source_data(source, 'cinema', 'dict')
    cities_dict = get_source_data(source, 'city', 'dict') 
    
    req = urllib.urlopen('%safisha/kiev' % source.url)
    if req.getcode() == 200:
        data = BeautifulSoup(req.read(), from_encoding="utf-8")
        cities_tag = data.find('select', id='city-selector')
        for ind, i in enumerate(cities_tag.findAll('option')):
            if i['value']:
                city_name = i.string.encode('utf-8')
                city_slug = low(del_separator(city_name))
                city_id = i['value'].encode('utf-8')
                
                city_obj = cities_dict.get(city_id)
        
                if not city_obj and city_id not in nofcities:
                    city = City.objects.filter(name__name=city_slug, name__status=2).distinct('pk')
                    if city.count() == 1:
                        city_obj = SourceCities.objects.create(
                            source_id = city_id,
                            source_obj = source,
                            city = city[0],
                            name = city_name,
                        )
                        cities_dict[city_id] = city_obj
                    else:
                        data_nof_city += '<city name="%s" slug="%s"></city>' % (city_name, city_slug)
                        nofcities.append(city_id)
                
                if city_obj:
                    url = '%scinema/%s' % (source.url, city_id)
                    req_cinema = urllib.urlopen(url)
                    if req_cinema.getcode() == 200:
                        data_cinema = BeautifulSoup(req_cinema.read(), from_encoding="utf-8")
                        for tag in data_cinema.findAll('a', {'class': 'cinema'}):
                            cinema_name = tag.string.encode('utf-8')
                            cinema_slug = low(del_separator(cinema_name))
                            cinema_id = tag.get('href').replace('/cinema/%s/' % city_id, '').encode('utf-8')
                        
                            cinema_obj = cinemas_dict.get(cinema_id)
                            
                            if not cinema_obj and cinema_id not in nofcinemas:
                                filter = {'name__name': cinema_slug, 'name__status': 2, 'city': city_obj.city}
                                cinema_kid = cinema_identification(cinema_slug, filter)
                                if cinema_kid:
                                    try:
                                        cin_obj = Cinema.objects.get(code=cinema_kid)
                                        cinema_obj = SourceCinemas.objects.create(
                                            source_id = cinema_id,
                                            source_obj = source,
                                            city = city_obj,
                                            cinema = cin_obj,
                                            name = cinema_name,
                                        )
                                        cinemas_dict[cinema_id] = cinema_obj
                                    except Cinema.DoesNotExist: pass
                                else:
                                    nofcinemas.append(cinema_id)
                                    data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (cinema_name, cinema_slug, city_name, city_obj.city.kid)

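            # pause after every fourth city so as not to hammer the source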
            if ind % 4 == 0:
                time.sleep(random.uniform(1.0, 3.0))
        
    create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_city)
    create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_cinema)
    cron_success('html', source.dump, 'cities_and_cinemas', 'Города и кинотеатры')
Exemple #29
0
def get_zapad24ru():
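    '''
    Parses the zapad24.ru afisha page (one table per cinema). The first
    table lists today's showtimes; the others give a date range per film.
    Matched films get SourceSchedules records; unidentified cities,
    cinemas and films are written to the "nof" XML dump files.
    '''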
    ignored = get_ignored_films()
    ignored_cinemas = get_ignored_cinemas()

    source = ImportSources.objects.get(url='http://zapad24.ru/')
    sfilm_clean(source)

    cities_dict = get_source_data(source, 'city', 'dict')
    cinemas_dict = get_source_data(source, 'cinema', 'dict')
    schedules = get_source_data(source, 'schedule', 'list')

    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    today = datetime.datetime.now()
    next_month = datetime.date.today() + datetime.timedelta(days=40)

    data_nof_films = ''
    data_nof_cinema = ''
    data_nof_city = ''
    noffilms = []

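    # the whole afisha sits on one page, one <table> per cinema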
    req = urllib.urlopen('%safisha/' % source.url)
    if req.getcode() == 200:
        data = BeautifulSoup(req.read())  #, from_encoding="utf-8"
        div = data.find('div', align="left")
        for ind, table in enumerate(
                div.findAll('table',
                            border="0",
                            cellpadding="0",
                            cellspacing="0",
                            width="100%")):
            cinema_tag = table.find('strong').string.encode('utf-8')
            cinema_name = re.findall(r'\".+\"',
                                     cinema_tag)[0].replace('"', '').strip()
            cinema_slug = low(del_separator(cinema_name))
            cinema_id = cinema_slug.decode('utf-8')

            city_name = re.findall(r'\(.+\)', cinema_tag)[0].replace(
                '(г. ', '').replace(')', '').strip()
            city_slug = low(del_separator(city_name))
            city_id = city_slug.decode('utf-8')

            city_obj = cities_dict.get(city_id)

            if not city_obj:
                city = City.objects.filter(name__name=city_slug,
                                           name__status=2).distinct('pk')
                if city.count() == 1:
                    city_obj = SourceCities.objects.create(
                        source_id=city_id,
                        source_obj=source,
                        city=city[0],
                        name=city_name,
                    )
                    cities_dict[city_id] = city_obj
                else:
                    if 'slug="%s"' % city_slug not in data_nof_city:
                        data_nof_city += '<city name="%s" slug="%s"></city>' % (
                            city_name, city_slug)

            if city_obj:
                cinema_ig_id = u'%s__%s' % (cinema_slug.decode('utf-8'),
                                            city_obj.city.kid)

                if cinema_ig_id not in ignored_cinemas:
                    cinema_obj = cinemas_dict.get(cinema_id)
                    if not cinema_obj:
                        filter1 = {
                            'name__name': cinema_slug,
                            'name__status': 2,
                            'city': city_obj.city
                        }
                        cinema_kid = cinema_identification(
                            cinema_slug, filter1)
                        if cinema_kid:
                            try:
                                cinema = Cinema.objects.get(code=cinema_kid)
                                cinema_obj = SourceCinemas.objects.create(
                                    source_id=cinema_id,
                                    source_obj=source,
                                    city=city_obj,
                                    cinema=cinema,
                                    name=cinema_name,
                                )
                                cinemas_dict[cinema_id] = cinema_obj
                            except Cinema.DoesNotExist:
                                pass
                        else:
                            if 'slug="%s"' % cinema_slug not in data_nof_cinema:
                                data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (
                                    cinema_name, cinema_slug, city_name,
                                    city_obj.city.kid)

                    if cinema_obj:
                        film_table = table.find('table')
                        date_from = None
                        date_to = None
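                        # the first table (ind == 0) carries plain showtimes for today;
                        # the remaining tables give a date range per film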
                        for tr in film_table.findAll('tr'):
                            film_name, film_slug, film_id = (None, None, None)
                            if ind == 0:
                                film_name = tr.find('b').string.encode('utf-8').strip()
                                film_slug = low(del_separator(film_name))
                                film_id = film_slug.decode('utf-8')
                            else:
                                showdate = ''
                                for f in tr.findAll('b'):
                                    if f.find('span'):
                                        showdate = f.find('span').string.encode('utf-8').strip()
                                    else:
                                        film_name = f.string.encode('utf-8').strip()
                                        film_name = re.findall(r'\«.+\»', film_name)[0]
                                        film_name = film_name.replace('«', '').replace('»', '').strip()
                                        film_slug = low(del_separator(film_name))
                                        film_id = film_slug.decode('utf-8')

                                if showdate and film_name:
                                    # the range comes either as "DD.MM-DD.MM"
                                    # or as "DD <month name> – DD <month name>"
                                    try:
                                        date_from, date_to = showdate.split('-')
                                        date_from_day, date_from_month = date_from.strip().split('.')
                                        date_to_day, date_to_month = date_to.strip().split('.')
                                    except ValueError:
                                        date_from, date_to = showdate.split(' – ')
                                        date_from_day, date_from_month = date_from.strip().split()
                                        date_from_month = get_month(date_from_month)
                                        date_to_day, date_to_month = date_to.strip().split()
                                        date_to_month = get_month(date_to_month)

                                    date_from = datetime.date(today.year, int(date_from_month), int(date_from_day))
                                    date_to = datetime.date(today.year, int(date_to_month), int(date_to_day))

                            full_url = tr.find('a').get('href').encode('utf-8')

                            if film_id not in noffilms and film_id not in ignored:
                                obj = films.get(film_id)
                                next_step = checking_obj(obj)

                                if next_step:
                                    if obj:
                                        kid = obj.kid
                                    else:
                                        kid, info = film_identification(film_slug, None, {}, {}, source=source)

                                    objt = None
                                    if kid:
                                        create_new, objt = unique_func(fdict, kid, obj)
                                        if create_new:
                                            objt = create_sfilm(film_id, kid, source, film_name)
                                            films[film_id] = objt
                                            if not fdict.get(kid):
                                                fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                            fdict[kid]['script_rel'].append(objt)
                                    elif not obj:
                                        data_nof_films += xml_noffilm(
                                            film_name, film_slug, None, None,
                                            film_id.encode('utf-8'), info,
                                            full_url, source.id)
                                        noffilms.append(film_id)

                                    if objt:
                                        req_film = urllib.urlopen(full_url)
                                        if req_film.getcode() == 200:
                                            data_film = BeautifulSoup(req_film.read())  #, from_encoding="utf-8"

                                            td = data_film.find('td', {'class': 'news'}).div.text.encode('utf-8')

                                            showtime = []

                                            if ind == 0:
                                                # first table: bare "HH:MM" times, valid for today
                                                showtime = re.findall(r'\d+\:\d+\s\s?', td)
                                            elif date_from and date_to and date_to < next_month:
                                                # later tables: "Начало сеансов: HH-MM, HH-MM";
                                                # expand the times over every day of the range
                                                showtimes = re.findall(r'Начало сеансов:\s?[\d+\-\d+\,?\s?]+', td)
                                                times = []
                                                for t in showtimes:
                                                    t = t.replace('Начало сеансов:', '').split(',')
                                                    times = [i.strip() for i in t if i.strip()]

                                                delta = date_to - date_from
                                                for day in range(delta.days + 1):
                                                    d = date_from + datetime.timedelta(days=day)
                                                    for t in times:
                                                        hours, minutes = t.split('-')
                                                        dtime = datetime.datetime(d.year, d.month, d.day, int(hours), int(minutes))
                                                        showtime.append(dtime)

                                            for t in showtime:
                                                if ind == 0:
                                                    hours, minutes = t.strip().split(':')
                                                    dtime = datetime.datetime(today.year, today.month, today.day, int(hours), int(minutes))
                                                else:
                                                    dtime = t

                                                # dedup key: datetime + cinema + city + film, spaces stripped
                                                sch_id = '%s%s%s%s' % (dtime, cinema_slug, city_slug, film_id.encode('utf-8'))
                                                sch_id = sch_id.replace(' ', '').decode('utf-8')

                                                if sch_id not in schedules:
                                                    SourceSchedules.objects.create(
                                                        source_id=sch_id,
                                                        source_obj=source,
                                                        film=objt,
                                                        cinema=cinema_obj,
                                                        dtime=dtime,
                                                    )
                                                    schedules.append(sch_id)

    create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_city)
    create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_cinema)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_films)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
Exemple #30
0
def get_bigyalta_organizations():
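    '''
    Imports Yalta organizations from a local sitemap dump
    (organizations_bigyalta.xml) and scrapes each organization's page on
    bigyalta.info for its address, phones, e-mail, site and tags.
    '''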
    REG_ADDR = re.compile(r'Адрес\:\s?.+')
    REG_TEL = re.compile(r'Телефон\:\s?.+')
    REG_MAIL = re.compile(r'E-mail\:\s?.+')
    REG_SITE = re.compile(r'Сайт\:\s?.+')

    source = ImportSources.objects.get(url='http://www.bigyalta.info/')

    org_phones = OrganizationPhones.objects.filter(
        organization__source_obj=source)
    phones_objs = {}
    for i in org_phones:
        phones_objs[i.phone] = i

    org_tags = OrganizationTags.objects.filter(organization__source_obj=source)
    tags_objs = {}
    for i in org_tags:
        tags_objs[i.name] = i

    org_streets = Street.objects.all()
    org_streets_dict = {}
    for i in org_streets:
        org_streets_dict[i.slug.encode('utf-8')] = i

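    # IDs of organizations already imported for this source, so re-runs skip them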
    source_ids = list(Organization.objects.filter(source_obj=source).values_list('source_id', flat=True))

    city_name = 'Ялта'

    city = City.objects.get(name__name=city_name, name__status=1)

    with open('%s/organizations_bigyalta.xml' % settings.API_DUMP_PATH,
              'r') as f:
        data = BeautifulSoup(f.read(), from_encoding="utf-8")

    count = 0
    for i in data.findAll('url'):
        count += 1

        url = i['value'].encode('utf-8')
        title = i['name'].encode('utf-8')

        source_id = url.replace('http://www.bigyalta.info/business/index.php?show=', '').decode('utf-8')

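        # fetch only organizations we have not imported yet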
        if source_id not in source_ids:
            req = urllib.urlopen(url)
            if req.getcode() == 200:
                # the organization page gets its own soup, distinct from the sitemap one
                org_data = BeautifulSoup(req.read(), from_encoding="utf-8")
                div = org_data.find('div', style='float:left; width:670px; padding-right:10px;')
                div = div.text.encode('utf-8')

                addr = REG_ADDR.findall(div)
                if addr:
                    addr = addr[0].replace('Адрес:', '').replace('Ялта,', '').strip()

                    street_name, street_type, house = get_org_street(addr.decode('utf-8'))

                    if street_type:
                        if street_name:
                            street_slug = low(del_separator(street_name))
                            street_obj = org_streets_dict.get(street_slug)
                            if not street_obj:
                                street_obj = Street.objects.create(
                                    name=street_name,
                                    slug=street_slug,
                                    type=street_type)
                                org_streets_dict[street_slug] = street_obj
                        else:
                            street_obj = None
                            house = None

                        building_obj = org_build_create(
                            house, city, street_obj)

                        # phones, site, e-mail
                        phones_temp = REG_TEL.findall(div)

                        email = REG_MAIL.findall(div)
                        if email:
                            email = email[0].replace('E-mail:', '').strip()
                        # no match, or an empty match, becomes None
                        if not email:
                            email = None

                        site = REG_SITE.findall(div)
                        if site:
                            site = site[0].replace('Сайт:', '').strip()
                        if not site:
                            site = None

                        phones = []
                        if phones_temp:
                            phones_temp = (phones_temp[0].replace('Телефон:', '')
                                           .replace(' ', '').replace('-', '').replace('–', ''))

                            # REG_PHONE is presumably a module-level pattern (not shown in this example)
                            phone = REG_PHONE.findall(phones_temp)
                            if phone:
                                phone = phone[0].decode('utf-8')
                                phone_obj = phones_objs.get(phone)
                                if not phone_obj:
                                    phone_obj = OrganizationPhones.objects.create(
                                        phone=phone)
                                    phones_objs[phone] = phone_obj
                                phones.append(phone_obj)

                        # tags (categories)
                        tags = []
                        for cat in i.findAll('tag'):
                            category_name = cat['value']
                            category_obj = tags_objs.get(category_name)
                            if not category_obj:
                                category_obj = OrganizationTags.objects.create(
                                    name=category_name)
                                tags_objs[category_name] = category_obj
                            tags.append(category_obj)

                        org_obj = Organization.objects.create(
                            name=title,
                            site=site,
                            email=email,
                            note=None,
                            source_obj=source,
                            source_id=source_id,
                        )

                        for j in phones:
                            org_obj.phones.add(j)

                        for j in tags:
                            org_obj.tags.add(j)

                        org_obj.buildings.add(building_obj)

                        source_ids.append(source_id)

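        # pause after every tenth detail request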
        if count % 10 == 0:
            time.sleep(random.uniform(1.0, 3.0))

    return HttpResponse('finish')