Ejemplo n.º 1
0
def get_zapad24ru():
    """Import cinema showtimes from the zapad24.ru listings page.

    Fetches ``<source.url>afisha/`` and walks every cinema table on it.
    For each table the city, the cinema and every listed film are looked
    up (or created as ``SourceCities`` / ``SourceCinemas`` / source film
    records); for matched films the film page is fetched and a
    ``SourceSchedules`` row is created for every session that is not
    already known.

    Cities, cinemas and films that could not be matched are accumulated
    as XML fragments and written with ``create_dump_file``; finally
    ``cron_success`` is called.

    The code targets Python 2: page text is handled as UTF-8 byte strings
    (``.encode('utf-8')``) and converted back with ``.decode('utf-8')``
    where identifiers are built.
    """
    ignored = get_ignored_films()
    ignored_cinemas = get_ignored_cinemas()

    source = ImportSources.objects.get(url='http://zapad24.ru/')
    sfilm_clean(source)

    cities_dict = get_source_data(source, 'city', 'dict')
    cinemas_dict = get_source_data(source, 'cinema', 'dict')
    schedules = get_source_data(source, 'schedule', 'list')

    # Already imported films of this source, keyed by their source id.
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    today = datetime.datetime.now()
    # Date ranges ending later than this (today + 40 days) are skipped.
    next_month = datetime.date.today() + datetime.timedelta(days=40)

    # XML fragments describing entities that could not be matched.
    data_nof_films = ''
    data_nof_cinema = ''
    data_nof_city = ''
    # Film ids already reported as "not found" during this run.
    noffilms = []

    req = urllib.urlopen('%safisha/' % source.url)
    if req.getcode() == 200:
        data = BeautifulSoup(req.read())  #, from_encoding="utf-8"
        div = data.find('div', align="left")
        for ind, table in enumerate(
                div.findAll('table',
                            border="0",
                            cellpadding="0",
                            cellspacing="0",
                            width="100%")):
            # The table header holds the cinema name in double quotes and
            # the city in parentheses.
            cinema_tag = table.find('strong').string.encode('utf-8')
            cinema_name = re.findall(r'\".+\"',
                                     cinema_tag)[0].replace('"', '').strip()
            cinema_slug = low(del_separator(cinema_name))
            cinema_id = cinema_slug.decode('utf-8')

            city_name = re.findall(r'\(.+\)', cinema_tag)[0].replace(
                '(г. ', '').replace(')', '').strip()
            city_slug = low(del_separator(city_name))
            city_id = city_slug.decode('utf-8')

            city_obj = cities_dict.get(city_id)

            if not city_obj:
                city = City.objects.filter(name__name=city_slug,
                                           name__status=2).distinct('pk')
                # Only an unambiguous (single) match is accepted.
                if city.count() == 1:
                    city_obj = SourceCities.objects.create(
                        source_id=city_id,
                        source_obj=source,
                        city=city[0],
                        name=city_name,
                    )
                    cities_dict[city_id] = city_obj
                else:
                    # Report each unmatched city slug only once.
                    if 'slug="%s"' % city_slug not in data_nof_city:
                        data_nof_city += '<city name="%s" slug="%s"></city>' % (
                            city_name, city_slug)

            if city_obj:
                cinema_ig_id = u'%s__%s' % (cinema_slug.decode('utf-8'),
                                            city_obj.city.kid)

                if cinema_ig_id not in ignored_cinemas:
                    cinema_obj = cinemas_dict.get(cinema_id)
                    if not cinema_obj:
                        filter1 = {
                            'name__name': cinema_slug,
                            'name__status': 2,
                            'city': city_obj.city
                        }
                        cinema_kid = cinema_identification(
                            cinema_slug, filter1)
                        if cinema_kid:
                            try:
                                cinema = Cinema.objects.get(code=cinema_kid)
                                cinema_obj = SourceCinemas.objects.create(
                                    source_id=cinema_id,
                                    source_obj=source,
                                    city=city_obj,
                                    cinema=cinema,
                                    name=cinema_name,
                                )
                                cinemas_dict[cinema_id] = cinema_obj
                            except Cinema.DoesNotExist:
                                # Identified code has no Cinema row; the
                                # table is skipped because cinema_obj
                                # stays empty.
                                pass
                        else:
                            # Report each unmatched cinema slug only once.
                            if 'slug="%s"' % cinema_slug not in data_nof_cinema:
                                data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (
                                    cinema_name, cinema_slug, city_name,
                                    city_obj.city.kid)

                    if cinema_obj:
                        film_table = table.find('table')
                        # Not reset per row: a row without a parsable
                        # date keeps the range of the previous row.
                        date_from = None
                        date_to = None
                        for tr in film_table.findAll('tr'):
                            film_name, film_slug, film_id = (None, None, None)
                            if ind == 0:
                                # First table: the row's first <b> is the
                                # film title; sessions are for today.
                                film_name = tr.find('b').string.encode(
                                    'utf-8').strip()
                                film_slug = low(del_separator(film_name))
                                film_id = film_slug.decode('utf-8')
                            else:
                                # Other tables: one <b> wraps a <span>
                                # with the date range, another holds the
                                # title inside « » quotes.
                                showdate = ''
                                for f in tr.findAll('b'):
                                    if f.find('span'):
                                        showdate = f.find(
                                            'span').string.encode(
                                                'utf-8').strip()
                                    else:
                                        film_name = f.string.encode(
                                            'utf-8').strip()
                                        film_name = re.findall(
                                            r'\«.+\»', film_name)[0]
                                        film_name = film_name.replace(
                                            '«', '').replace('»', '').strip()
                                        film_slug = low(
                                            del_separator(film_name))
                                        film_id = film_slug.decode('utf-8')

                                if showdate and film_name:
                                    try:
                                        # Numeric form: "DD.MM-DD.MM".
                                        date_from, date_to = showdate.split(
                                            '-')
                                        date_from_day, date_from_month = date_from.strip(
                                        ).split('.')
                                        date_to_day, date_to_month = date_to.strip(
                                        ).split('.')
                                    except ValueError:
                                        # Textual form: "DD month – DD
                                        # month"; month names are mapped
                                        # by get_month.
                                        date_from, date_to = showdate.split(
                                            ' – ')
                                        date_from_day, date_from_month = date_from.strip(
                                        ).split()
                                        date_from_month = get_month(
                                            date_from_month)
                                        date_to_day, date_to_month = date_to.strip(
                                        ).split()
                                        date_to_month = get_month(
                                            date_to_month)

                                    # The page gives no year; the current
                                    # year is assumed.
                                    date_from = datetime.date(
                                        today.year, int(date_from_month),
                                        int(date_from_day))
                                    date_to = datetime.date(
                                        today.year, int(date_to_month),
                                        int(date_to_day))

                            full_url = tr.find('a').get('href').encode('utf-8')

                            if film_id not in noffilms and film_id not in ignored:
                                obj = films.get(film_id)
                                next_step = checking_obj(obj)

                                if next_step:
                                    if obj:
                                        kid = obj.kid
                                    else:
                                        kid, info = film_identification(
                                            film_slug,
                                            None, {}, {},
                                            source=source)

                                    objt = None
                                    if kid:
                                        create_new, objt = unique_func(
                                            fdict, kid, obj)
                                        if create_new:
                                            objt = create_sfilm(
                                                film_id, kid, source,
                                                film_name)
                                            films[film_id] = objt
                                            if not fdict.get(kid):
                                                fdict[kid] = {
                                                    'editor_rel': [],
                                                    'script_rel': []
                                                }
                                            fdict[kid]['script_rel'].append(
                                                objt)
                                    elif not obj:
                                        # Fix: accumulate into
                                        # data_nof_films, the variable
                                        # that is initialised above and
                                        # dumped at the end. The original
                                        # used an undefined name here and
                                        # raised UnboundLocalError.
                                        data_nof_films += xml_noffilm(
                                            film_name, film_slug, None, None,
                                            film_id.encode('utf-8'), info,
                                            full_url, source.id)
                                        noffilms.append(film_id)

                                    if objt:
                                        req_film = urllib.urlopen(full_url)
                                        if req_film.getcode() == 200:
                                            data_film = BeautifulSoup(
                                                req_film.read()
                                            )  #, from_encoding="utf-8"

                                            td = data_film.find(
                                                'td', {
                                                    'class': 'news'
                                                }).div.text.encode('utf-8')

                                            # Raw "HH:MM" strings for the
                                            # first table, datetime
                                            # objects for the others.
                                            showtime = []

                                            if ind == 0:
                                                showtime = re.findall(
                                                    r'\d+\:\d+\s\s?', td)
                                            else:
                                                if date_from and date_to:
                                                    if date_to < next_month:
                                                        showtimes = re.findall(
                                                            r'Начало сеансов:\s?[\d+\-\d+\,?\s?]+',
                                                            td)
                                                        # Only the last
                                                        # matched line's
                                                        # times survive.
                                                        times = []
                                                        for t in showtimes:
                                                            t = t.replace(
                                                                'Начало сеансов:',
                                                                '').split(',')
                                                            times = [
                                                                i.strip()
                                                                for i in t
                                                                if i.strip()
                                                            ]

                                                        # Expand the range
                                                        # into one session
                                                        # per day and time
                                                        # ("HH-MM").
                                                        delta = date_to - date_from
                                                        for day in range(
                                                                delta.days +
                                                                1):
                                                            d = date_from + datetime.timedelta(
                                                                days=day)
                                                            for t in times:
                                                                hours, minutes = t.split(
                                                                    '-')
                                                                dtime = datetime.datetime(
                                                                    d.year,
                                                                    d.month,
                                                                    d.day,
                                                                    int(hours),
                                                                    int(minutes
                                                                        ))
                                                                showtime.append(
                                                                    dtime)

                                            for t in showtime:
                                                if ind == 0:
                                                    hours, minutes = t.strip(
                                                    ).split(':')
                                                    dtime = datetime.datetime(
                                                        today.year,
                                                        today.month, today.day,
                                                        int(hours),
                                                        int(minutes))
                                                else:
                                                    dtime = t

                                                # Session id: datetime +
                                                # cinema + city + film,
                                                # with spaces removed.
                                                sch_id = '%s%s%s%s' % (
                                                    dtime, cinema_slug,
                                                    city_slug,
                                                    film_id.encode('utf-8'))
                                                sch_id = sch_id.replace(
                                                    ' ', '').decode('utf-8')

                                                if sch_id not in schedules:
                                                    SourceSchedules.objects.create(
                                                        source_id=sch_id,
                                                        source_obj=source,
                                                        film=objt,
                                                        cinema=cinema_obj,
                                                        dtime=dtime,
                                                    )
                                                    schedules.append(sch_id)

    create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_city)
    create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_cinema)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_films)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
Ejemplo n.º 2
0
def get_premierzal_cinemas():
    """Import the cinema list from premierzal.ru.

    Requests ``<source.url>theatres`` for the first known source city and
    reads two link blocks from the page: ``this_city_theatres`` (cinemas
    of the requested city) and ``other_city_theatres`` (links labelled
    ``"City, Cinema"``). Every cinema that is neither stored already nor
    ignored is identified with ``cinema_identification``; matched ones
    become ``SourceCinemas`` rows, unmatched ones are written to the
    "not found" dump. ``cron_success`` is called at the end.

    The code targets Python 2 (``urllib.urlopen``, ``dict.values()[0]``,
    byte-string ``.decode``).
    """
    source = ImportSources.objects.get(url='http://www.premierzal.ru/')

    # Source ids of cinemas that are already imported.
    cinemas = get_source_data(source, 'cinema', 'list')

    cities_dict = get_source_data(source, 'city', 'dict')

    # All cinemas keyed by code, to resolve the identification result.
    cinemas_dict = {}
    for i in Cinema.objects.all():
        cinemas_dict[i.code] = i

    ignored_cinemas = get_ignored_cinemas()

    data_nof_cinema = ''

    # Any known city serves as the "current" city of the request; the
    # other-cities block of the page is used for the remaining ones.
    city = cities_dict.values()[0]

    body = urllib.urlencode({
        'city': city.name.encode('utf-8'),
    })

    url = '%stheatres?%s' % (source.url, body)

    req = urllib.urlopen(url)
    if req.getcode() == 200:
        data = BeautifulSoup(req.read())

        # Pairs of (block, is_current_city). The flag travels with the
        # block instead of being derived from the list position, so a
        # missing "this city" block can no longer make the "other cities"
        # block be treated as the current city.
        blocks = []

        block1 = data.find('div', {'class': 'this_city_theatres'})
        block2 = data.find('div', {'class': 'other_city_theatres'})

        if block1:
            blocks.append((block1, True))

        if block2:
            blocks.append((block2, False))

        for block, is_current_city in blocks:
            for a in block.findAll('a'):
                cinema_name = a.text.encode('utf-8').strip().replace('"', '')
                # The source id is the path segment of "/theatres/<id>/".
                cinema_id = a.get('href').replace('/theatres/',
                                                  '').replace('/', '')

                if is_current_city:
                    city_obj = city
                else:
                    # Label format in this block: "City, Cinema".
                    city_name, cinema_name = cinema_name.split(',')
                    cinema_name = cinema_name.strip()
                    city_slug = low(del_separator(city_name.strip()))
                    city_obj = cities_dict.get(city_slug.decode('utf-8'))

                cinema_slug = low(del_separator(cinema_name))

                if city_obj:
                    cinema_ig_id = u'%s__%s' % (cinema_slug.decode('utf-8'),
                                                city_obj.city.kid)

                    if cinema_id.decode(
                            'utf-8'
                    ) not in cinemas and cinema_ig_id not in ignored_cinemas:

                        filter1 = {
                            'name__name': cinema_slug,
                            'name__status': 2,
                            'city__id': city_obj.city_id
                        }

                        cinema = cinema_identification(cinema_slug, filter1)

                        cin_obj = cinemas_dict.get(cinema)
                        if cin_obj:
                            SourceCinemas.objects.create(
                                source_id=cinema_id,
                                source_obj=source,
                                city=city_obj,
                                cinema=cin_obj,
                                name=cinema_name,
                            )
                            cinemas.append(cinema_id.decode('utf-8'))
                        else:
                            data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (
                                cinema_name, cinema_slug,
                                city_obj.name.encode('utf-8'),
                                city_obj.city.kid)

    create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_cinema)
    cron_success('html', source.dump, 'cinemas', 'Кинотеатры')
Ejemplo n.º 3
0
def get_megamag():
    """Import showtimes from kinoteatr.megamag.by.

    The import runs in three stages:

    1. Read the region list from the index page and map every region to
       a ``SourceCities`` row (creating it on an unambiguous match).
    2. Visit every cinema page linked from the index, resolve the cinema,
       remember the films ("events") listed there and collect the links
       of the individual sessions.
    3. Open every collected session link that is not stored yet, resolve
       the film and create a ``SourceSchedules`` row.

    Unmatched cities, cinemas and films are written with
    ``create_dump_file``; ``cron_success`` is called at the end.

    The code targets Python 2 (``cookielib``, ``urllib2``, ``httplib``).
    """
    import cookielib

    def give_me_cookie():
        """Return a new urllib2 opener backed by an empty cookie jar."""
        cookie = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie),
                                      urllib2.HTTPHandler())
        return opener

    ignored = get_ignored_films()

    ignored_cinemas = get_ignored_cinemas()

    source = ImportSources.objects.get(url='http://megamag.by/')
    sfilm_clean(source)

    megamag_cities_dict = get_source_data(source, 'city', 'dict')
    megamag_cinemas_dict = get_source_data(source, 'cinema', 'dict')

    # Already imported films of this source, keyed by their source id.
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    # City name as shown on the site (UTF-8 bytes) -> SourceCities row.
    cities_data = {}

    # XML fragments describing entities that could not be matched.
    data_nof_films = ''
    data_nof_cinema = ''
    data_nof_city = ''
    # Film ids already reported as "not found" during this run.
    noffilms = []
    # One entry per cinema: resolved objects plus its session links.
    schedules_data = []

    opener = give_me_cookie()
    req = opener.open(urllib2.Request('http://kinoteatr.megamag.by/index.php'))
    # Event number (int) -> {'name': film title, 'id': film id}.
    event_dict = {}
    if req.getcode() == 200:
        data = BeautifulSoup(req.read(), from_encoding="utf-8")

        # Stage 1: regions.
        cities = data.find('div', id="box-region")

        for i in cities.findAll('a'):

            city_name = i.text.encode('utf-8')
            city_slug = low(del_separator(city_name))
            city_id = i.get('href').replace(
                'http://kinoteatr.megamag.by/index.php?region_id=', '')

            mcity = megamag_cities_dict.get(city_id)

            if not mcity:
                city = City.objects.filter(name__name=city_slug,
                                           name__status=2).distinct('pk')
                # Only an unambiguous (single) match is accepted.
                if city.count() == 1:
                    mcity = SourceCities.objects.create(
                        source_id=city_id,
                        source_obj=source,
                        city=city[0],
                        name=city_name,
                    )
                else:
                    # Report each unmatched city slug only once.
                    if 'slug="%s"' % city_slug not in data_nof_city:
                        data_nof_city += '<city name="%s" slug="%s"></city>' % (
                            city_name, city_slug)

            if mcity:
                cities_data[city_name] = mcity

        # Stage 2: cinema pages.
        try:
            cinemas_tag = data.findAll('td',
                                       {'class': 'Cinema_new_box_1_BoxText'},
                                       limit=1)[0]
        except IndexError:
            cinemas_tag = None

        if cinemas_tag:
            for i in cinemas_tag.findAll('a'):
                cinema_url = i.get('href')
                cinema_id = cinema_url.replace(
                    'http://kinoteatr.megamag.by/index.php?cPath=', '')
                cinema_obj = megamag_cinemas_dict.get(cinema_id)

                opener = give_me_cookie()
                try:
                    req2 = opener.open(urllib2.Request(cinema_url))

                    if req2.getcode() == 200:
                        schedules_page = BeautifulSoup(req2.read(),
                                                       from_encoding="utf-8")
                        # The first "object_param_value" div is read as
                        # the cinema's city name.
                        city_name = schedules_page.findAll(
                            'div', {'class': 'object_param_value'},
                            limit=1)[0].text.encode('utf-8')

                        city_obj = cities_data.get(city_name)
                        if city_obj:
                            cinema_name = schedules_page.find(
                                'div', {
                                    'class': 'object_title'
                                }).text.encode('utf-8')
                            cinema_name = cinema_name.replace('"', '').replace(
                                'Кинотеатр', '')
                            cinema_slug = low(del_separator(cinema_name))

                            cinema_ig_id = u'%s__%s' % (
                                cinema_slug.decode('utf-8'), city_obj.city.kid)

                            if cinema_ig_id not in ignored_cinemas:

                                if not cinema_obj:
                                    filter1 = {
                                        'name__name': cinema_slug,
                                        'name__status': 2,
                                        'city': city_obj.city
                                    }
                                    cinema_kid = cinema_identification(
                                        cinema_slug, filter1)
                                    if cinema_kid:
                                        try:
                                            cinema = Cinema.objects.get(
                                                code=cinema_kid)
                                            cinema_obj = SourceCinemas.objects.create(
                                                source_id=cinema_id,
                                                source_obj=source,
                                                city=city_obj,
                                                cinema=cinema,
                                                name=cinema_name,
                                            )
                                        except Cinema.DoesNotExist:
                                            # NOTE(review): cinema_obj
                                            # stays None here while
                                            # cinema_kid is set, so the
                                            # entry below is stored with
                                            # 'mcinema': None - confirm
                                            # this is intended.
                                            pass
                                else:
                                    cinema_kid = cinema_obj.cinema.code

                                if cinema_kid:
                                    # Remember which film each event
                                    # number on this page refers to.
                                    for event in schedules_page.findAll(
                                            'td', {'class': 'eventsHeading'}):
                                        if event.a.get('name'):
                                            ev = event.a['name'].split('_')[1]
                                            fname = event.a.text.encode(
                                                'utf-8')
                                            fid = event.a.get('href').replace(
                                                'http://kinoteatr.megamag.by/newsdesk_info.php?newsdesk_id=',
                                                '')
                                            event_dict[int(ev)] = {
                                                'name': fname,
                                                'id': int(fid)
                                            }

                                    # Collect session links (those whose
                                    # href contains "cPath").
                                    links = []
                                    for td in schedules_page.findAll(
                                            'td', {'class': 'main'}):
                                        for link in td.findAll('a'):
                                            l = link.get('href')
                                            if l and 'cPath' in l:
                                                links.append(l)
                                    schedules_data.append({
                                        'mcity':
                                        city_obj,
                                        'city':
                                        city_obj.city,
                                        'mcinema':
                                        cinema_obj,
                                        'cinema':
                                        cinema_kid,
                                        'schedules':
                                        set(links)
                                    })
                                else:
                                    # Report each unmatched cinema slug
                                    # only once.
                                    if 'slug="%s"' % cinema_slug not in data_nof_cinema:
                                        data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (
                                            cinema_name, cinema_slug,
                                            city_name, city_obj.city.kid)
                except httplib.HTTPException:
                    # Best effort: a cinema page that fails at the HTTP
                    # level is skipped.
                    pass
        create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH,
                         '<data>%s</data>' % data_nof_city)
        create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH,
                         '<data>%s</data>' % data_nof_cinema)

        # Stage 3: individual sessions.
        # Source ids of sessions that are already stored.
        megamag = get_source_data(source, 'schedule', 'list')

        for obj in schedules_data:
            # Taken before the inner loop because the name "obj" is
            # reused for the film lookup further down.
            cinema_object = obj['mcinema']

            for index, i in enumerate(obj['schedules']):
                opener = give_me_cookie()
                try:
                    req3 = opener.open(urllib2.Request(i))
                    if req3.getcode() == 200:

                        id_schedule = i.replace(
                            'http://kinoteatr.megamag.by/index.php?cPath=',
                            '').encode('utf-8')
                        if id_schedule not in megamag:
                            sch_page = BeautifulSoup(req3.read(),
                                                     from_encoding="utf-8")

                            tables = sch_page.findAll('table', {
                                'class':
                                'Cinema_new_box_2_TemplateCenterPart'
                            },
                                                      limit=1)[0]
                            main_table = tables.findAll('table',
                                                        cellpadding='4',
                                                        limit=1)[0]
                            tr = main_table.findAll('tr')[1]
                            td = tr.findAll('strong')

                            # The third "_"-separated part of the session
                            # id is the event number recorded in stage 2.
                            event_id = id_schedule.split('_')[2]
                            film_data = event_dict.get(int(event_id))
                            if film_data:
                                film_name = film_data['name']
                                film_name_slug = low(
                                    del_separator(del_screen_type(film_name)))
                                film_id = film_data['id']

                                if film_id not in noffilms and film_name_slug.decode(
                                        'utf-8') not in ignored:

                                    obj = films.get(
                                        str(film_id).decode('utf-8'))
                                    next_step = checking_obj(obj)

                                    if next_step:
                                        if obj:
                                            kid = obj.kid
                                        else:
                                            kid, info = film_identification(
                                                film_name_slug,
                                                None, {}, {},
                                                source=source)

                                        objt = None
                                        if kid:
                                            create_new, objt = unique_func(
                                                fdict, kid, obj)
                                            if create_new:
                                                objt = create_sfilm(
                                                    film_id, kid, source,
                                                    film_name)
                                                films[str(film_id).decode(
                                                    'utf-8')] = objt
                                                if not fdict.get(kid):
                                                    fdict[kid] = {
                                                        'editor_rel': [],
                                                        'script_rel': []
                                                    }
                                                fdict[kid][
                                                    'script_rel'].append(objt)
                                        elif not obj:
                                            data_nof_films += xml_noffilm(
                                                film_name, film_name_slug,
                                                None, None, film_id, info,
                                                None, source.id)
                                            noffilms.append(film_id)

                                        if objt:
                                            # The second <strong> is split
                                            # on whitespace: day first,
                                            # month name second, "(HH:MM)"
                                            # last.
                                            dtime_info = td[1].text.encode(
                                                'utf-8').split()
                                            # The page gives no year; the
                                            # current year is assumed.
                                            year_info = datetime.datetime.now(
                                            ).year
                                            day_info = int(dtime_info[0])
                                            month_low = low(
                                                dtime_info[1].replace(',', ''))
                                            month_info = int(
                                                get_month(month_low))
                                            time_info = dtime_info[-1].replace(
                                                '(', '').replace(')',
                                                                 '').split(':')

                                            dtime = datetime.datetime(
                                                year_info, month_info,
                                                day_info, int(time_info[0]),
                                                int(time_info[1]), 0)
                                            SourceSchedules.objects.create(
                                                source_id=id_schedule,
                                                source_obj=source,
                                                cinema=cinema_object,
                                                film=objt,
                                                dtime=dtime,
                                            )
                except httplib.HTTPException:
                    # Log the failing URL. The context manager closes the
                    # log file deterministically (the original left the
                    # handle to be closed by garbage collection).
                    with open('%s/httplib_errors.txt' % settings.API_DUMP_PATH,
                              'a') as error_log:
                        error_log.write('%s\n' % i)
                # Pause for 2 seconds after every 60th request to the
                # source.
                if (index + 1) % 60 == 0:
                    time.sleep(2.0)

    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_films)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
Ejemplo n.º 4
0
def get_rambler_cinemas():
    """Import Rambler Kassa venues and link them to local ``Cinema`` rows.

    Reads ``dump_rambler_index.xml`` from ``settings.API_DUMP_PATH``, collects
    the ``filename`` of every ``<file>`` under its ``<places>`` element and
    downloads each of those export files from the Rambler Kassa API.  For
    every ``<place>`` whose ``cityid`` is present in this source's city
    mapping, and which is neither already imported nor ignored:

    * if ``cinema_identification`` resolves to a code present in the
      ``Cinema`` table, a ``SourceCinemas`` row is created;
    * otherwise the venue is appended (once per slug) to the "not found"
      XML dump.

    Finally the dump is written to ``settings.NOF_DUMP_PATH`` and
    ``cron_success`` is called.
    """
    source = ImportSources.objects.get(url='http://www.rambler.ru/')

    cinemas_ids = get_source_data(source, 'cinema', 'list')
    rambler_cities_dict = get_source_data(source, 'city', 'dict')

    cinemas_by_code = {cinema.code: cinema for cinema in Cinema.objects.all()}

    ignored_cinemas = get_ignored_cinemas()

    # ``with`` guarantees the index dump is closed even if reading fails.
    index_path = '%s/dump_rambler_index.xml' % settings.API_DUMP_PATH
    with open(index_path, 'r') as index_file:
        xml_index = BeautifulSoup(index_file.read(), from_encoding="utf-8")

    places = xml_index.find('places')
    filenames = []
    for file_tag in places.findAll('file'):
        filename = file_tag.get('filename')
        if filename:
            filenames.append(filename)

    # Ids inserted during this run; ``cinemas_ids`` was loaded before the
    # loop, so without this a place repeated in another export file would be
    # inserted a second time.
    created_ids = set()
    # Slugs already reported as "not found" and the matching XML fragments.
    nof_slugs = set()
    nof_entries = []

    for filename in filenames:
        url = 'http://api.kassa.rambler.ru/v2/%s/xml/Movie/export/sale/%s' % (
            RAMBLER_API_KEY, filename)
        req = urllib.urlopen(url)
        try:
            xml = None
            if req.getcode() == 200:
                xml = BeautifulSoup(req.read(), from_encoding="utf-8")
        finally:
            # Release the connection as soon as the payload has been parsed.
            req.close()
        if xml is None:
            continue

        for place in xml.findAll('place'):
            place_id = place.objectid.string
            name = place.find('name').string.encode('utf-8')
            name_slug = low(del_separator(name))
            address = place.address.string.encode(
                'utf-8') if place.address.string else None
            latitude = place.latitude.string
            longitude = place.longitude.string
            city_obj = rambler_cities_dict.get(place.cityid.string)

            # Venues in cities unknown to this source are skipped.
            if not city_obj:
                continue

            cinema_ig_id = u'%s__%s' % (name_slug.decode('utf-8'),
                                        city_obj.city.kid)
            if (place_id in cinemas_ids or place_id in created_ids
                    or cinema_ig_id in ignored_cinemas):
                continue

            filter1 = {
                'name__name': name_slug,
                'name__status': 2,
                'city__id': city_obj.city_id
            }
            cinema_code = cinema_identification(name_slug, filter1)
            cin_obj = cinemas_by_code.get(cinema_code)
            if cin_obj:
                SourceCinemas.objects.create(
                    source_id=place_id,
                    source_obj=source,
                    city=city_obj,
                    cinema=cin_obj,
                    name=name,
                    address=address,
                    latitude=latitude,
                    longitude=longitude,
                )
                created_ids.add(place_id)
            elif name_slug not in nof_slugs:
                # Report every unidentified slug only once.
                nof_slugs.add(name_slug)
                nof_entries.append(
                    '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (
                        name, name_slug, city_obj.name.encode('utf-8'),
                        city_obj.city.kid))

    create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % ''.join(nof_entries))
    cron_success('xml', source.dump, 'cinemas', 'Кинотеатры')
Ejemplo n.º 5
0
def _luxor_strip_cdata(text):
    """Remove leftover ``[CDATA[`` / ``]]`` markers and surrounding whitespace."""
    return text.replace('[CDATA[', '').replace(']]', '').strip()


def _luxor_city_name(city_obj):
    """Return the UTF-8 encoded city name having ``status == 1`` ('' if none).

    When several names have that status the last one iterated wins.
    """
    city_name = ''
    for name_obj in city_obj.city.name.all():
        if name_obj.status == 1:
            city_name = name_obj.name.encode('utf-8')
    return city_name


def get_luxor_cinemas():
    """Import Luxor theatres and their halls from the ``GetHalls`` response.

    For every ``<theatre>`` element the theatre name is used both as the
    cinema name and as the key for looking up the city:

    * the city is taken from this source's city mapping or, failing that,
      matched against ``City`` names with ``status == 2`` and stored as a
      new ``SourceCities`` row when exactly one city matches;
    * the cinema is taken from this source's cinema mapping or resolved via
      ``cinema_identification`` and stored as a new ``SourceCinemas`` row;
    * each ``<hall>`` not yet known for this source is matched against
      ``Hall`` rows of the cinema and stored as ``SourceHalls`` when exactly
      one hall matches.

    Everything that cannot be matched is written to the ``*_nof_city``,
    ``*_nof_cinema`` and ``*_nof_hall`` dumps in ``settings.NOF_DUMP_PATH``;
    ``cron_success`` is called at the end.
    """
    query = 'QueryCode=GetHalls'

    data = get_luxor_data_by_socket(query)

    source = ImportSources.objects.get(url='http://luxor.ru/')

    data_nof_cinema = ''
    data_nof_city = ''
    data_nof_hall = ''
    nofcities = []

    cinemas = get_source_data(source, 'cinema', 'dict')
    cities = get_source_data(source, 'city', 'dict')

    ignored_cinemas = get_ignored_cinemas()

    halls = get_source_data(source, 'hall', 'dict')

    xml_data = BeautifulSoup(data, from_encoding="utf-8")

    for cinema in xml_data.findAll('theatre'):
        cinema_id = cinema['id'].encode('utf-8')
        cinema_name = _luxor_strip_cdata(
            cinema.find('name').text.encode('utf-8'))
        cinema_slug = low(del_separator(cinema_name))

        address = _luxor_strip_cdata(
            cinema.find('address').text.encode('utf-8')).replace('"', "'")

        # The city mapping is looked up by the *decoded* slug, so the same
        # decoded key must be used when a new entry is stored below.
        city_key = cinema_slug.decode('utf-8')
        city_obj = cities.get(city_key)
        if not city_obj and cinema_slug not in nofcities:
            city = City.objects.filter(name__name=cinema_slug,
                                       name__status=2).distinct('pk')
            if city.count() == 1:
                city_obj = SourceCities.objects.create(
                    source_id=cinema_slug,
                    source_obj=source,
                    city=city[0],
                    name=cinema_name,
                )
                # Previously stored under the byte-string slug, which the
                # decoded lookup above never found for non-ASCII names.
                cities[city_key] = city_obj
            else:
                data_nof_city += '<city name="%s" slug="%s" info="%s"></city>' % (
                    cinema_name, cinema_slug, address)
                nofcities.append(cinema_slug)

        if not city_obj:
            continue

        cinema_obj = cinemas.get(cinema_id)
        city_kid = city_obj.city.kid

        cinema_ig_id = u'%s__%s' % (city_key, city_kid)
        if cinema_ig_id in ignored_cinemas:
            continue

        if not cinema_obj:
            filter1 = {
                'name__name': cinema_slug,
                'name__status': 2,
                'city__kid': city_kid
            }
            cinema_kid = cinema_identification(cinema_slug, filter1, {},
                                               city_kid)
            if cinema_kid:
                cin_obj = Cinema.objects.get(code=cinema_kid)
                cinema_obj = SourceCinemas.objects.create(
                    source_id=cinema_id,
                    source_obj=source,
                    city=city_obj,
                    cinema=cin_obj,
                    name=cinema_name,
                )
                cinemas[cinema_id] = cinema_obj
            else:
                data_nof_cinema += '<cinema name="Люксор %s" slug="%s" city="%s" city_kid="%s"></cinema>' % (
                    cinema_name, cinema_slug, _luxor_city_name(city_obj),
                    city_kid)

        # Halls can only be attached to an identified cinema.
        if not cinema_obj:
            continue

        for hall in cinema.findAll('hall'):
            hall_id = hall['id'].encode('utf-8')
            hall_name = _luxor_strip_cdata(
                hall.find('name').string.encode('utf-8'))
            hall_slug = low(del_separator(hall_name))

            if halls.get(hall_id):
                continue

            hall_matches = Hall.objects.filter(
                name__name=hall_slug,
                cinema=cinema_obj.cinema).distinct('pk')
            if hall_matches.count() == 1:
                SourceHalls.objects.create(
                    source_id=hall_id,
                    source_obj=source,
                    cinema=cinema_obj,
                    name=hall_name,
                    kid=hall_matches[0].kid,
                )
            else:
                cinema_kid = cinema_obj.cinema.code
                # Identifier written to the dump: the four parts
                # concatenated with spaces removed.
                hall_uid = '%s%s%s%s' % (city_kid, cinema_kid, hall_name,
                                         hall_slug)
                hall_uid = hall_uid.replace(' ', '')
                data_nof_hall += '<hall city="%s" city_kid="%s" cinema="Люксор %s" cinema_kid="%s" name="%s" slug="%s" id="%s"></hall>' % (
                    _luxor_city_name(city_obj), city_kid, cinema_name,
                    cinema_kid, hall_name, hall_slug, hall_uid)

    create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_city)
    create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_cinema)
    create_dump_file('%s_nof_hall' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_hall)

    cron_success('xml', source.dump, 'cities_and_cinemas',
                 'Города и кинотеатры')
Ejemplo n.º 6
0
def get_surkino_cinemas():
    """Import the cinemas listed on the surkino.ru front page.

    Makes sure a ``SourceCities`` row for the hard-coded city exists, then
    downloads the source's front page and walks the ``div`` elements with
    class ``ciname`` / ``ciname last`` inside ``div.cinemas``.  Each cinema
    that is neither already imported nor ignored is resolved through
    ``cinema_identification``: a match is stored as ``SourceCinemas``, a
    miss is appended (once per slug) to the "not found" XML dump.  The dump
    is written to ``settings.NOF_DUMP_PATH`` and ``cron_success`` is called
    at the end, whether or not the page could be fetched.
    """
    source = ImportSources.objects.get(url='http://surkino.ru/')

    city_name = 'Сургут'
    city_slug = low(del_separator(city_name))
    city = City.objects.get(name__name=city_name, name__status=1)

    city_obj, city_created = SourceCities.objects.get_or_create(
        source_id=city_slug,
        source_obj=source,
        defaults={
            'source_id': city_slug,
            'source_obj': source,
            'city': city,
            'name': city_name,
        })

    cinemas = get_source_data(source, 'cinema', 'list')

    ignored_cinemas = get_ignored_cinemas()

    # Slugs already reported as "not found" and the matching XML fragments.
    nof_slugs = set()
    nof_entries = []

    req = urllib.urlopen(source.url)
    try:
        data = None
        if req.getcode() == 200:
            data = BeautifulSoup(req.read(), from_encoding="windows-1251")
    finally:
        # Release the connection once the page has been parsed.
        req.close()

    if data is not None:
        div = data.find('div', {'class': 'cinemas'})

        # With bs4 a search for class "ciname" also returns tags whose class
        # attribute is "ciname last", so the second pass revisits them.
        # Remember handled ids so no cinema is inserted twice in one run.
        seen_ids = set()
        for css_class in ('ciname', 'ciname last'):
            for cinema_tag in div.findAll('div', {'class': css_class}):
                cinema_name = cinema_tag.a.get('title').encode(
                    'utf-8').replace('Кинотеатр ', '')
                cinema_slug = low(del_separator(cinema_name))
                cinema_id = cinema_tag.a.get('href').replace('?cinema=', '')

                if cinema_id in seen_ids:
                    continue
                seen_ids.add(cinema_id)

                cinema_ig_id = u'%s__%s' % (cinema_slug.decode('utf-8'),
                                            city_obj.city.kid)
                if cinema_id in cinemas or cinema_ig_id in ignored_cinemas:
                    continue

                filter1 = {
                    'name__name': cinema_slug,
                    'name__status': 2,
                    'city__id': city_obj.city_id
                }
                cinema_kid = cinema_identification(cinema_slug, filter1)
                if cinema_kid:
                    try:
                        cinema = Cinema.objects.get(code=cinema_kid)
                    except Cinema.DoesNotExist:
                        # Best effort: the identified code has no Cinema
                        # row, so this entry is skipped.
                        continue
                    SourceCinemas.objects.create(
                        source_id=cinema_id,
                        source_obj=source,
                        city=city_obj,
                        cinema=cinema,
                        name=cinema_name,
                    )
                elif cinema_slug not in nof_slugs:
                    # Report every unidentified slug only once.
                    nof_slugs.add(cinema_slug)
                    nof_entries.append(
                        '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (
                            cinema_name, cinema_slug,
                            city_obj.name.encode('utf-8'),
                            city_obj.city.kid))

    create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % ''.join(nof_entries))
    cron_success('html', source.dump, 'cinemas', 'Кинотеатры')