Example #1
def get_luxor_schedules():
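    '''
    Fetch Luxor sessions over the socket API and store any new ones
    as SourceSchedules rows.
    '''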

    query = 'QueryCode=GetSessions'

    data = get_luxor_data_by_socket(query)

    source = ImportSources.objects.get(url='http://luxor.ru/')

    #create_dump_file('%s_schedules' % source.dump, settings.API_DUMP_PATH, data)
    '''
    xml = open('%s/dump_%s_schedules.xml' % (settings.API_DUMP_PATH, source.dump), 'r')# temp
    data = xml.read()# temp
    xml.close()# temp
    '''
    films = get_source_data(source, 'film', 'dict')
    cinemas = get_source_data(source, 'cinema', 'dict')
    halls = get_source_data(source, 'hall', 'dict')
    schedules = get_source_data(source, 'schedule', 'list')

    xml_data = BeautifulSoup(data, from_encoding="utf-8")

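    # each <session> element nests <theatre>, <hall> and <movie> tags
    # plus <date> (dd.mm.yyyy) and <time> (hh:mm) strings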
    for session in xml_data.findAll('session'):
        sch_id = session['id']
        if sch_id not in schedules:
            cinema_id = session.theatre['id'].encode('utf-8')
            hall_id = session.theatre.hall['id'].encode('utf-8')
            film_id = session.movie['id'].encode('utf-8')

            cinema_obj = cinemas.get(cinema_id)
            film_obj = films.get(film_id)
            hall_obj = halls.get(hall_id)

            if cinema_obj and film_obj and hall_obj:
                showdate = session.date.string.encode('utf-8')
                showtime = session.time.string.encode('utf-8')

                day, month, year = showdate.split('.')
                hours, minutes = showtime.split(':')

                dtime = datetime.datetime(int(year), int(month), int(day),
                                          int(hours), int(minutes))

                SourceSchedules.objects.create(
                    source_id=sch_id,
                    source_obj=source,
                    film=film_obj,
                    cinema=cinema_obj,
                    dtime=dtime,
                    hall=hall_obj.kid,
                )

    cron_success('xml', source.dump, 'schedules', 'Сеансы')
Example #2
def get_kinoteatrua_releases():
    '''
    Fetch Ukrainian releases
    '''
    opener = give_me_cookie()

    source = ImportSources.objects.get(url='http://kino-teatr.ua/')

    films_dict = get_source_data(source, 'film', 'dict')

    releases = SourceReleases.objects.select_related('film').filter(
        source_obj=source)
    releases_dict = {}
    for i in releases:
        releases_dict[i.film.source_id] = i

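    # page that lists films with upcoming release dates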
    url = '%sfilms-near.phtml' % source.url

    req = opener.open(urllib2.Request(url))
    if req.getcode() == 200:
        data = BeautifulSoup(req.read(), from_encoding="utf-8")
        for ind, i in enumerate(data.findAll('a',
                                             {'class': 'searchItemLink'})):
            film_url = i.get('href')
            film_id = film_url.replace('http://kino-teatr.ua/film/',
                                       '').replace('.phtml',
                                                   '').encode('utf-8')
            film_obj = films_dict.get(film_id)
            if film_obj:
                req2 = opener.open(urllib2.Request(film_url))
                if req2.getcode() == 200:
                    data2 = BeautifulSoup(req2.read(), from_encoding="utf-8")
                    block = data2.find('div', id='filmInfo')
                    strong = block.find('strong',
                                        text=u"Премьера (в Украине): ")
                    day, month, year = strong.find_next_sibling(
                        "a").text.strip().split('.')
                    showdate = datetime.date(int(year), int(month), int(day))
                    release_obj = releases_dict.get(film_id)
                    if release_obj:
                        if release_obj.release != showdate:
                            release_obj.release = showdate
                            release_obj.save()
                    else:
                        release_obj = SourceReleases.objects.create(
                            source_obj=source,
                            film=film_obj,
                            release=showdate,
                        )
                        releases_dict[film_id] = release_obj

            # pause after every film page to avoid hammering the source
            time.sleep(random.uniform(1.0, 3.0))

    cron_success('html', source.dump, 'releases', 'Укр.релизы')
Example #3
def get_okinoua_cities():
    """
    Парсинг городов Украины
    """
    source = ImportSources.objects.get(url='http://www.okino.ua/')

    # Get the cities already stored in SourceCities, as a list of ids
    cities_ids = get_source_data(source, 'city', 'list')
    data_nof_city = ''

    # Open the page with the cities
    url = '%skinoafisha-kiev/' % source.url
    req = urllib.urlopen(url)
    if req.getcode() == 200:
        page = BeautifulSoup(req.read(), from_encoding="utf-8")
        # Find all the city tags and read the city ids and names from them
        for ul in page.findAll('ul', {'class': 'blist'}):
            for li in ul.findAll('li'):
                id = li.a.get('href').replace('/', '')
                name = li.a.string.encode('utf-8').strip()
                name_slug = low(del_separator(name))
                # Compare the scraped cities with the cities in our DB and, if there is NO match,
                if id not in cities_ids:
                    # try to identify the new city
                    city = City.objects.filter(name__name=name_slug,
                                               name__status=2).distinct('pk')
                    # if identified, write it to the SourceCities table
                    if city.count() == 1:
                        SourceCities.objects.create(
                            source_id=id,
                            source_obj=source,
                            city=city[0],
                            name=name,
                        )
                    # otherwise write the unmatched cities to XML for further handling
                    else:
                        if 'slug="%s"' % name_slug not in data_nof_city:
                            data_nof_city += '<city name="%s" slug="%s"></city>' % (
                                name, name_slug)

    create_dump_file('okinoua_nof_city', settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_city)
    cron_success('html', 'okinoua', 'cities', 'Укр. города')
Example #4
def get_rambler_cities():
    source = ImportSources.objects.get(url='http://www.rambler.ru/')

    cities_ids = get_source_data(source, 'city', 'list')
    data_nof_city = ''
    '''
    # LOCALHOST
    f = open('%s/dump_rambler_city.xml' % settings.API_DUMP_PATH, 'r')
    xml = BeautifulSoup(f.read(), from_encoding="utf-8")
    f.close()
    if xml: # --- end localhost
    '''

    # SERVER
    url = 'http://api.kassa.rambler.ru/v2/%s/xml/cities/' % RAMBLER_API_KEY  # dump_rambler_city.xml
    req = urllib.urlopen(url)
    if req.getcode() == 200:
        xml = BeautifulSoup(req.read(),
                            from_encoding="utf-8")  # --- end server

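        # match each city from the feed against our City table by slugified name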
        for i in xml.findAll('city'):
            id = i.cityid.string
            name = i.find('name').string.encode('utf-8')
            name_slug = low(del_separator(name))
            if id not in cities_ids:
                city = City.objects.filter(name__name=name_slug,
                                           name__status=2).distinct('pk')
                if city.count() == 1:
                    SourceCities.objects.create(
                        source_id=id,
                        source_obj=source,
                        city=city[0],
                        name=name,
                    )
                else:
                    if 'slug="%s"' % name_slug not in data_nof_city:
                        data_nof_city += '<city name="%s" slug="%s"></city>' % (
                            name, name_slug)

    create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_city)
    cron_success('xml', source.dump, 'cities', 'Города')
Example #5
def get_premierzal_cities():
    source = ImportSources.objects.get(url='http://www.premierzal.ru/')

    cities = get_source_data(source, 'city', 'list')

    data_nof_city = ''

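    # the front page carries a city drop-down (div.drop) with a link per city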
    req = urllib.urlopen(source.url)
    if req.getcode() == 200:
        data = BeautifulSoup(req.read())

        block = data.find('div', {'class': 'drop'})

        for i in block.findAll('a'):
            city_name = i.text.encode('utf-8').strip()
            city_id = low(del_separator(city_name))

            if city_id.decode('utf-8') not in cities:

                city = City.objects.filter(name__name=city_id,
                                           name__status=2).distinct('pk')

                if city.count() == 1:
                    SourceCities.objects.create(
                        source_id=city_id,
                        source_obj=source,
                        city=city[0],
                        name=city_name,
                    )
                else:
                    data_nof_city += '<city name="%s" slug="%s"></city>' % (
                        city_name, city_id)

                cities.append(city_id.decode('utf-8'))

    create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_city)
    cron_success('html', source.dump, 'cities', 'Города')
Example #6
def get_ivi_file():
    '''
    Fetch the txt file
    '''
    source = ImportSources.objects.get(url='http://antipiracy.ivi.ru/')
    films = get_source_data(source, 'film', 'list')
    url = '%s-/' % source.url

    req = urllib.urlopen(url)
    if req.getcode() == 200:
        links = BeautifulSoup(req.read(), from_encoding="windows-1251")

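        # look through the page links for the 'in one file.txt' entry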
        for i in links.findAll('a'):
            link = i.string.encode('utf-8')
            if 'in one file.txt' in link:
                req2 = urllib.urlopen('%s%s' % (url, i.get('href')))
                data = BeautifulSoup(req2.read(), from_encoding="windows-1251")
                file = str(data).replace('<html><head></head><body>',
                                         '').replace('</body></html>', '')
                create_dump_file(source.dump, settings.API_DUMP_PATH, file,
                                 'txt')

    cron_success('html', source.dump, 'file', 'txt файл с данными')
Example #7
def get_vkinocomua_cities_and_cinemas():
    nofcities = []
    nofcinemas = []
    data_nof_cinema = ''
    data_nof_city = ''
    
    source = ImportSources.objects.get(url='http://vkino.com.ua/')

    cinemas_dict = get_source_data(source, 'cinema', 'dict')
    cities_dict = get_source_data(source, 'city', 'dict') 
    
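    # the Kiev afisha page carries the full city selector (select#city-selector)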
    req = urllib.urlopen('%safisha/kiev' % source.url)
    if req.getcode() == 200:
        data = BeautifulSoup(req.read(), from_encoding="utf-8")
        cities_tag = data.find('select', id='city-selector')
        for ind, i in enumerate(cities_tag.findAll('option')):
            if i['value']:
                city_name = i.string.encode('utf-8')
                city_slug = low(del_separator(city_name))
                city_id = i['value'].encode('utf-8')
                
                city_obj = cities_dict.get(city_id)
        
                if not city_obj and city_id not in nofcities:
                    city = City.objects.filter(name__name=city_slug, name__status=2).distinct('pk')
                    if city.count() == 1:
                        city_obj = SourceCities.objects.create(
                            source_id = city_id,
                            source_obj = source,
                            city = city[0],
                            name = city_name,
                        )
                        cities_dict[city_id] = city_obj
                    else:
                        data_nof_city += '<city name="%s" slug="%s"></city>' % (city_name, city_slug)
                        nofcities.append(city_id)
                
                if city_obj:
                    url = '%scinema/%s' % (source.url, city_id)
                    req_cinema = urllib.urlopen(url)
                    if req_cinema.getcode() == 200:
                        data_cinema = BeautifulSoup(req_cinema.read(), from_encoding="utf-8")
                        for tag in data_cinema.findAll('a', {'class': 'cinema'}):
                            cinema_name = tag.string.encode('utf-8')
                            cinema_slug = low(del_separator(cinema_name))
                            cinema_id = tag.get('href').replace('/cinema/%s/' % city_id, '').encode('utf-8')
                        
                            cinema_obj = cinemas_dict.get(cinema_id)
                            
                            if not cinema_obj and cinema_id not in nofcinemas:
                                filter = {'name__name': cinema_slug, 'name__status': 2, 'city': city_obj.city}
                                cinema_kid = cinema_identification(cinema_slug, filter)
                                if cinema_kid:
                                    try:
                                        cin_obj = Cinema.objects.get(code=cinema_kid)
                                        cinema_obj = SourceCinemas.objects.create(
                                            source_id = cinema_id,
                                            source_obj = source,
                                            city = city_obj,
                                            cinema = cin_obj,
                                            name = cinema_name,
                                        )
                                        cinemas_dict[cinema_id] = cinema_obj
                                    except Cinema.DoesNotExist: pass
                                else:
                                    nofcinemas.append(cinema_id)
                                    data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (cinema_name, cinema_slug, city_name, city_obj.city.kid)

            if ind % 4 == 0:
                time.sleep(random.uniform(1.0, 3.0))
        
    create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_city)
    create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_cinema)
    cron_success('html', source.dump, 'cities_and_cinemas', 'Города и кинотеатры')
Example #8
def get_megamag():
    '''
    Fetch film URLs
    '''
    import cookielib

    def give_me_cookie():
        cookie = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie),
                                      urllib2.HTTPHandler())
        return opener

    ignored = get_ignored_films()

    ignored_cinemas = get_ignored_cinemas()

    source = ImportSources.objects.get(url='http://megamag.by/')
    sfilm_clean(source)

    megamag_cities_dict = get_source_data(source, 'city', 'dict')
    megamag_cinemas_dict = get_source_data(source, 'cinema', 'dict')

    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    cities_data = {}

    data_nof_films = ''
    data_nof_cinema = ''
    data_nof_city = ''
    noffilms = []
    schedules_data = []

    opener = give_me_cookie()
    req = opener.open(urllib2.Request('http://kinoteatr.megamag.by/index.php'))
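    # event id -> {'name': film title, 'id': film id}, filled from the cinema pages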
    event_dict = {}
    if req.getcode() == 200:
        data = BeautifulSoup(req.read(), from_encoding="utf-8")

        cities = data.find('div', id="box-region")

        for i in cities.findAll('a'):

            city_name = i.text.encode('utf-8')
            city_slug = low(del_separator(city_name))
            city_id = i.get('href').replace(
                'http://kinoteatr.megamag.by/index.php?region_id=', '')

            mcity = megamag_cities_dict.get(city_id)

            if not mcity:
                city = City.objects.filter(name__name=city_slug,
                                           name__status=2).distinct('pk')
                if city.count() == 1:
                    mcity = SourceCities.objects.create(
                        source_id=city_id,
                        source_obj=source,
                        city=city[0],
                        name=city_name,
                    )
                else:
                    if 'slug="%s"' % city_slug not in data_nof_city:
                        data_nof_city += '<city name="%s" slug="%s"></city>' % (
                            city_name, city_slug)

            if mcity:
                cities_data[city_name] = mcity

        try:
            cinemas_tag = data.findAll('td',
                                       {'class': 'Cinema_new_box_1_BoxText'},
                                       limit=1)[0]
        except IndexError:
            cinemas_tag = None

        if cinemas_tag:
            for i in cinemas_tag.findAll('a'):
                cinema_url = i.get('href')
                cinema_id = cinema_url.replace(
                    'http://kinoteatr.megamag.by/index.php?cPath=', '')
                cinema_obj = megamag_cinemas_dict.get(cinema_id)

                opener = give_me_cookie()
                try:
                    req2 = opener.open(urllib2.Request(cinema_url))

                    if req2.getcode() == 200:
                        schedules_page = BeautifulSoup(req2.read(),
                                                       from_encoding="utf-8")
                        city_name = schedules_page.findAll(
                            'div', {'class': 'object_param_value'},
                            limit=1)[0].text.encode('utf-8')

                        city_obj = cities_data.get(city_name)
                        if city_obj:
                            cinema_name = schedules_page.find(
                                'div', {
                                    'class': 'object_title'
                                }).text.encode('utf-8')
                            cinema_name = cinema_name.replace('"', '').replace(
                                'Кинотеатр', '')
                            cinema_slug = low(del_separator(cinema_name))

                            cinema_ig_id = u'%s__%s' % (
                                cinema_slug.decode('utf-8'), city_obj.city.kid)

                            if cinema_ig_id not in ignored_cinemas:

                                if not cinema_obj:
                                    filter1 = {
                                        'name__name': cinema_slug,
                                        'name__status': 2,
                                        'city': city_obj.city
                                    }
                                    cinema_kid = cinema_identification(
                                        cinema_slug, filter1)
                                    if cinema_kid:
                                        try:
                                            cinema = Cinema.objects.get(
                                                code=cinema_kid)
                                            cinema_obj = SourceCinemas.objects.create(
                                                source_id=cinema_id,
                                                source_obj=source,
                                                city=city_obj,
                                                cinema=cinema,
                                                name=cinema_name,
                                            )
                                        except Cinema.DoesNotExist:
                                            pass
                                else:
                                    cinema_kid = cinema_obj.cinema.code

                                if cinema_kid:
                                    for event in schedules_page.findAll(
                                            'td', {'class': 'eventsHeading'}):
                                        if event.a.get('name'):
                                            ev = event.a['name'].split('_')[1]
                                            fname = event.a.text.encode(
                                                'utf-8')
                                            fid = event.a.get('href').replace(
                                                'http://kinoteatr.megamag.by/newsdesk_info.php?newsdesk_id=',
                                                '')
                                            event_dict[int(ev)] = {
                                                'name': fname,
                                                'id': int(fid)
                                            }

                                    links = []
                                    for td in schedules_page.findAll(
                                            'td', {'class': 'main'}):
                                        for link in td.findAll('a'):
                                            l = link.get('href')
                                            if l and 'cPath' in l:
                                                links.append(l)
                                    schedules_data.append({
                                        'mcity': city_obj,
                                        'city': city_obj.city,
                                        'mcinema': cinema_obj,
                                        'cinema': cinema_kid,
                                        'schedules': set(links),
                                    })
                                else:
                                    if 'slug="%s"' % cinema_slug not in data_nof_cinema:
                                        data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (
                                            cinema_name, cinema_slug,
                                            city_name, city_obj.city.kid)
                except httplib.HTTPException:
                    pass
        create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH,
                         '<data>%s</data>' % data_nof_city)
        create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH,
                         '<data>%s</data>' % data_nof_cinema)

        megamag = get_source_data(source, 'schedule', 'list')

        for obj in schedules_data:
            cinema_object = obj['mcinema']

            for index, i in enumerate(obj['schedules']):
                opener = give_me_cookie()
                try:
                    req3 = opener.open(urllib2.Request(i))
                    if req3.getcode() == 200:

                        id_schedule = i.replace(
                            'http://kinoteatr.megamag.by/index.php?cPath=',
                            '').encode('utf-8')
                        if id_schedule not in megamag:
                            sch_page = BeautifulSoup(req3.read(),
                                                     from_encoding="utf-8")

                            tables = sch_page.findAll(
                                'table',
                                {'class': 'Cinema_new_box_2_TemplateCenterPart'},
                                limit=1)[0]
                            main_table = tables.findAll('table',
                                                        cellpadding='4',
                                                        limit=1)[0]
                            tr = main_table.findAll('tr')[1]
                            td = tr.findAll('strong')

                            event_id = id_schedule.split('_')[2]
                            film_data = event_dict.get(int(event_id))
                            if film_data:
                                film_name = film_data['name']
                                film_name_slug = low(
                                    del_separator(del_screen_type(film_name)))
                                film_id = film_data['id']

                                if film_id not in noffilms and film_name_slug.decode(
                                        'utf-8') not in ignored:

                                    obj = films.get(
                                        str(film_id).decode('utf-8'))
                                    next_step = checking_obj(obj)

                                    if next_step:
                                        if obj:
                                            kid = obj.kid
                                        else:
                                            kid, info = film_identification(
                                                film_name_slug,
                                                None, {}, {},
                                                source=source)

                                        objt = None
                                        if kid:
                                            create_new, objt = unique_func(
                                                fdict, kid, obj)
                                            if create_new:
                                                objt = create_sfilm(
                                                    film_id, kid, source,
                                                    film_name)
                                                films[str(film_id).decode(
                                                    'utf-8')] = objt
                                                if not fdict.get(kid):
                                                    fdict[kid] = {
                                                        'editor_rel': [],
                                                        'script_rel': []
                                                    }
                                                fdict[kid][
                                                    'script_rel'].append(objt)
                                        elif not obj:
                                            data_nof_films += xml_noffilm(
                                                film_name, film_name_slug,
                                                None, None, film_id, info,
                                                None, source.id)
                                            noffilms.append(film_id)

                                        if objt:
                                            dtime_info = td[1].text.encode(
                                                'utf-8').split()
                                            year_info = datetime.datetime.now(
                                            ).year
                                            day_info = int(dtime_info[0])
                                            month_low = low(
                                                dtime_info[1].replace(',', ''))
                                            month_info = int(
                                                get_month(month_low))
                                            time_info = dtime_info[-1].replace(
                                                '(', '').replace(')',
                                                                 '').split(':')

                                            dtime = datetime.datetime(
                                                year_info, month_info,
                                                day_info, int(time_info[0]),
                                                int(time_info[1]), 0)
                                            SourceSchedules.objects.create(
                                                source_id=id_schedule,
                                                source_obj=source,
                                                cinema=cinema_object,
                                                film=objt,
                                                dtime=dtime,
                                            )
                except httplib.HTTPException:
                    open('%s/httplib_errors.txt' % settings.API_DUMP_PATH,
                         'a').write('%s\n' % i)
                # pause for 2 seconds after every 60th request to the source
                if (index + 1) % 60 == 0:
                    time.sleep(2.0)

    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_films)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
Example #9
def get_vkinocomua_films_and_schedules():
    ignored = get_ignored_films()

    data_nof_film = ''
    noffilms = []

    source = ImportSources.objects.get(url='http://vkino.com.ua/')
    sfilm_clean(source)
    
    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)
    
    schedules = get_source_data(source, 'schedule', 'list')

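    # walk the showtimes page of every cinema already known for this source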
    cinemas_data = SourceCinemas.objects.select_related('city').filter(source_obj=source)
    cinemas = {}
    for ind, i in enumerate(cinemas_data):
        url = '%scinema/%s/%s/showtimes' % (source.url, i.city.source_id, i.source_id)
        req = urllib.urlopen(url)
        if req.getcode() == 200:
            data = BeautifulSoup(req.read(), from_encoding="utf-8")
            main = data.find('div', id='cinema-showtimes')
            if main:
                for content in main.findAll('div', {'class': 'content'}):
                    film_tag = content.find('a', {'class': 'navi'})
                    film_name = film_tag.string.encode('utf-8').strip()
                    film_slug = low(del_separator(film_name))
                    
                    full_url = film_tag.get('href').encode('utf-8')
                    film_id = re.findall(r'\/\d+\/', full_url)
                    if film_id:
                        film_id = film_id[0].replace('/','').encode('utf-8')
                    else:
                        film_id = film_slug
                    
                    full_url = '%s%s' % (source.url, full_url.lstrip('/'))
                    

                    if film_id not in noffilms and film_slug.decode('utf-8') not in ignored:

                        obj = films.get(film_id.decode('utf-8'))
                        next_step = checking_obj(obj)
                    
                        if next_step:
                            if obj:
                                kid = obj.kid
                            else:
                                kid, info = film_identification(film_slug, None, {}, {}, source=source)
                        
                            objt = None
                            if kid:
                                create_new, objt = unique_func(fdict, kid, obj)
                                if create_new:
                                    objt = create_sfilm(film_id, kid, source, film_name)
                                    films[film_id] = objt
                                    if not fdict.get(kid):
                                        fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                    fdict[kid]['script_rel'].append(objt)
                            elif not obj:
                                data_nof_film += xml_noffilm(film_name, film_slug, None, None, film_id, info, url.encode('utf-8'), source.id)
                                noffilms.append(film_id)
                        

                            if objt:
                                for div in content.findAll('div', {'class': 'date'}):
                                    year, month, day = div['data-date'].split('-')
                                    show = div.find_next_sibling("ul")
                                    for li in show.findAll('li'):
                                        if li.a:
                                            extra = li.a.get('href')
                                            hours, minutes = li.a.text.strip().split(':')
                                        else:
                                            extra = None
                                            hours, minutes = li.text.strip().split(':')

                                        # sale = True if extra else False

                                        dtime = datetime.datetime(int(year), int(month), int(day), int(hours), int(minutes))

                                        sch_id = u'%s%s%s%s' % (dtime, i.source_id, i.city_id, film_id.decode('utf-8'))
                                        sch_id = sch_id.replace(' ', '')

                                        if sch_id not in schedules:
                                            SourceSchedules.objects.create(
                                                source_id = sch_id,
                                                source_obj = source,
                                                film = objt,
                                                cinema = i,
                                                dtime = dtime,
                                                extra = extra,
                                            )
                                            schedules.append(sch_id)
        if ind % 4 == 0:
            time.sleep(random.uniform(1.0, 3.0))        
        
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
Example #10
def get_cinemaarthall_schedules():
    ignored = get_ignored_films()

    data_nof_film = ''
    noffilms = []

    city_name = 'Норильск'
    cinema_name = 'Синема-АРТ-Холл'
    city_slug = low(del_separator(city_name))
    cinema_slug = low(del_separator(cinema_name))

    source = ImportSources.objects.get(url='http://cinemaarthall.ru/')
    sfilm_clean(source)

    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    schedules = get_source_data(source, 'schedule', 'list')

    city = City.objects.get(name__name=city_name, name__status=1)
    cinema = Cinema.objects.get(name__name=cinema_name,
                                name__status=1,
                                city=city)

    city_obj, city_created = SourceCities.objects.get_or_create(
        source_id=city_slug,
        source_obj=source,
        defaults={
            'source_id': city_slug,
            'source_obj': source,
            'city': city,
            'name': city_name,
        })

    cinema_obj, cinema_created = SourceCinemas.objects.get_or_create(
        source_id=cinema_slug,
        source_obj=source,
        defaults={
            'source_id': cinema_slug,
            'source_obj': source,
            'city': city_obj,
            'cinema': cinema,
            'name': cinema_name,
        })

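    # collect the available show dates from the 'datachek' calendar block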
    dates = []
    url = '%spage/kino/films/' % source.url

    req = urllib.urlopen(url)
    if req.getcode() == 200:
        data = BeautifulSoup(req.read())
        show_days = data.find('div', id='datachek')
        for a in show_days.findAll('a'):
            day = a.get('href').replace('/page/kino/films/&date=', '')
            dates.append(day)

    for d in dates:
        url = '%spage/kino/films/&date=%s' % (source.url, d)
        req = urllib.urlopen(url)
        if req.getcode() == 200:
            data = BeautifulSoup(req.read())
            for div in data.findAll('div', {'class': 'media-block'}):
                film_name = div.find('h3')
                if film_name:
                    film_name = film_name.string.encode('utf-8')
                    film_id = div.findAll('a', limit=1)[0].get('href').replace(
                        '/', '').encode('utf-8')
                    film_slug = del_screen_type(low(del_separator(film_name)))
                    full_url = '%spage/kino/films/%s' % (source.url, film_id)

                    if film_id not in noffilms and film_slug.decode(
                            'utf-8') not in ignored:

                        obj = films.get(film_id)
                        next_step = checking_obj(obj)

                        if next_step:
                            if obj:
                                kid = obj.kid
                            else:
                                kid, info = film_identification(film_slug,
                                                                None, {}, {},
                                                                source=source)

                            objt = None
                            if kid:
                                create_new, objt = unique_func(fdict, kid, obj)
                                if create_new:
                                    objt = create_sfilm(
                                        film_id, kid, source, film_name)
                                    films[film_id] = objt
                                    if not fdict.get(kid):
                                        fdict[kid] = {
                                            'editor_rel': [],
                                            'script_rel': []
                                        }
                                    fdict[kid]['script_rel'].append(objt)
                            elif not obj:
                                data_nof_film += xml_noffilm(
                                    film_name, film_slug, None, None, film_id,
                                    info, full_url.encode('utf-8'), source.id)
                                noffilms.append(film_id)

                            if objt:
                                div_sess = div.find('div', {'class': 'filmr'})
                                for t in div_sess.findAll('span'):
                                    if t.string:
                                        t = t.string.split(',')[0]
                                        hours, minutes = t.split(':')
                                        day, month, year = d.split('.')
                                        dtime = datetime.datetime(
                                            int(year), int(month), int(day),
                                            int(hours), int(minutes))

                                        sch_id = '%s%s%s%s' % (
                                            dtime, cinema_slug, city_slug,
                                            film_id)
                                        sch_id = sch_id.replace(
                                            ' ', '').decode('utf-8')

                                        if sch_id not in schedules:
                                            SourceSchedules.objects.create(
                                                source_id=sch_id,
                                                source_obj=source,
                                                film=objt,
                                                cinema=cinema_obj,
                                                dtime=dtime,
                                            )
                                            schedules.append(sch_id)

    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
Example #11
def page_parser(city_name, cinema_name, source):
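    '''
    Shared parser for one city/cinema pair: import a week of schedules
    from the given source and return the accumulated nof-film XML.
    '''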
    ignored = get_ignored_films()

    data_nof_film = ''
    noffilms = []

    city_slug = low(del_separator(city_name))
    cinema_slug = low(del_separator(cinema_name))

    source = ImportSources.objects.get(url=source)
    sfilm_clean(source)

    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    schedules = get_source_data(source, 'schedule', 'list')

    city = City.objects.get(name__name=city_name, name__status=1)
    cinema = Cinema.objects.get(name__name=cinema_name,
                                name__status=1,
                                city=city)

    city_obj, city_created = SourceCities.objects.get_or_create(
        source_id=city_slug,
        source_obj=source,
        defaults={
            'source_id': city_slug,
            'source_obj': source,
            'city': city,
            'name': city_name,
        })

    cinema_obj, cinema_created = SourceCinemas.objects.get_or_create(
        source_id=cinema_slug,
        source_obj=source,
        defaults={
            'source_id': cinema_slug,
            'source_obj': source,
            'city': city_obj,
            'cinema': cinema,
            'name': cinema_name,
        })

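    # crawl the sessions page for today plus the next six days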
    main_url = '%ssessions/' % source.url
    today = datetime.date.today()
    next_week = today + datetime.timedelta(days=6)

    delta = next_week - today
    for day in range(delta.days + 1):
        date_obj = today + datetime.timedelta(days=day)

        url = '%s%s' % (main_url, date_obj)
        req = urllib.urlopen(url)
        if req.getcode() == 200:
            data = BeautifulSoup(req.read(), from_encoding="utf-8")
            main = data.find('div', id="section-session")
            if main:
                main = main.find('table')
                for tr in main.findAll('tr'):
                    showtime, film = tr.findAll('td', limit=2)
                    hours, minutes = showtime.string.split(':')
                    film_a = film.findAll('a')

                    if film_a:
                        film_a = film_a[1] if len(film_a) > 1 else film_a[0]
                        full_url = film_a.get('href')
                        film_id = full_url.replace(
                            '%sfilms/' % source.url,
                            '').replace('/', '').encode('utf-8')
                        film_name = del_screen_type(
                            film_a.get('title').encode('utf-8')).strip()
                        film_slug = low(del_separator(film_name))

                        if film_id not in noffilms and film_slug.decode(
                                'utf-8') not in ignored:

                            obj = films.get(film_id)
                            next_step = checking_obj(obj)

                            if next_step:
                                if obj:
                                    kid = obj.kid
                                else:
                                    kid, info = film_identification(
                                        film_slug, None, {}, {}, source=source)

                                objt = None
                                if kid:
                                    create_new, objt = unique_func(
                                        fdict, kid, obj)
                                    if create_new:
                                        objt = create_sfilm(
                                            film_id, kid, source, film_name)
                                        films[film_id] = objt
                                        if not fdict.get(kid):
                                            fdict[kid] = {
                                                'editor_rel': [],
                                                'script_rel': []
                                            }
                                        fdict[kid]['script_rel'].append(objt)
                                elif not obj:
                                    data_nof_film += xml_noffilm(
                                        film_name, film_slug, None,
                                        None, film_id, info,
                                        full_url.encode('utf-8'), source.id)
                                    noffilms.append(film_id)

                                if objt:
                                    dtime = datetime.datetime(
                                        date_obj.year, date_obj.month,
                                        date_obj.day, int(hours), int(minutes))
                                    sch_id = '%s%s%s%s' % (dtime, cinema_slug,
                                                           city_slug, film_id)
                                    sch_id = sch_id.replace(' ',
                                                            '').decode('utf-8')

                                    if sch_id not in schedules:
                                        SourceSchedules.objects.create(
                                            source_id=sch_id,
                                            source_obj=source,
                                            film=objt,
                                            cinema=cinema_obj,
                                            dtime=dtime,
                                        )
                                        schedules.append(sch_id)
    return data_nof_film
Example #12
def get_kinohod_schedules():
    #    print "BEGIN get_kinohod_schedules()"
    t1 = time.time()
    start_time = datetime.datetime.now().strftime('%H:%M:%S')

    opener = urllib2.build_opener()
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 5.1; rv:10.0.1) Gecko/20100101 Firefox/10.0.1',
    }
    opener.addheaders = headers.items()

    cron_data_new = 0
    cron_data_new_sale = 0
    cron_data_nof = ''
    cron_count = 0
    cron_count_sale = 0
    film_list = []
    film_nof_list = []
    cinemas_count = 0

    source = ImportSources.objects.get(url='http://kinohod.ru/')
    kinohod_cinemas_dict = get_source_data(source, 'cinema', 'dict')
    kinohod_films_dict = get_source_data(source, 'film', 'dict')

    today = datetime.datetime.now().date()
    today_add_seven_days = today + datetime.timedelta(days=6)

    kinohod_schedules = list(
        SourceSchedules.objects.filter(
            dtime__gte=today, source_obj=source).values_list('source_id',
                                                             flat=True))

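    # query the partner API day by day over a seven-day window per cinema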
    for cinema_id, cinema_obj in kinohod_cinemas_dict.iteritems():
        today_temp = today
        while today_temp <= today_add_seven_days:
            today_str = today_temp.strftime("%d%m%Y")
            today_temp += datetime.timedelta(days=1)

            url = 'http://www.kinohod.ru/api/rest/partner/v1/cinemas/%s/schedules?apikey=%s&date=%s' % (
                cinema_id, SERVER_API_KEY, today_str)

            #url = 'http://www.kinohod.ru/api/rest/partner/v1/cinemas/%s/schedules?apikey=%s' % (cinema_id, SERVER_API_KEY)
            #url = 'http://www.kinohod.ru/api/rest/partner/v1/cinemas/300/schedules?apikey=%s' % SERVER_API_KEY # ------ !!!!
            try:
                try:
                    req = opener.open(url)
                    if req.getcode() == 200:
                        cinemas_count += 1
                        json_data = req.read()
                        data = json.loads(json_data)

                        for i in data:
                            film_id = str(i['movie']['id'])
                            film_obj = kinohod_films_dict.get(film_id)

                            if film_obj:
                                film_list.append(film_id)
                            else:
                                film_nof_list.append(film_id)

                            for s in i['schedules']:
                                sale = s['isSaleAllowed']
                                cron_count += 1
                                if sale:
                                    cron_count_sale += 1

                                if film_obj:
                                    id = str(s['id'])
                                    if id not in kinohod_schedules:
                                        show_d = s['startTime'].split(
                                            'T')[0].split('-')
                                        dtime = datetime.date(
                                            int(show_d[0]), int(show_d[1]),
                                            int(show_d[2]))

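                                        # shows starting between 00:00 and 05:59
                                        # count toward the previous show day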
                                        hour = int(s['time'].split(':')[0])
                                        if hour >= 0 and hour <= 5:
                                            dtime = dtime - datetime.timedelta(
                                                days=1)

                                        show_t = '%s:00' % s['time']
                                        show_t = show_t.split(':')
                                        dtime = datetime.datetime(
                                            dtime.year, dtime.month, dtime.day,
                                            int(show_t[0]), int(show_t[1]), 0)

                                        SourceSchedules.objects.create(
                                            source_id=id,
                                            source_obj=source,
                                            cinema=cinema_obj,
                                            film=film_obj,
                                            dtime=dtime,
                                            sale=sale,
                                        )

                                        cron_data_new += 1
                                        if sale:
                                            cron_data_new_sale += 1
                except httplib.HTTPException:
                    open('%s/httplib_errors.txt' % settings.API_DUMP_PATH,
                         'a').write('%s\n' % url)
            except (urllib2.HTTPError, urllib2.URLError):
                open('%s/httplib_errors.txt' % settings.API_DUMP_PATH,
                     'a').write('urllib2***\t%s\n' % url)

    # cron log
    film_sum = len(set(film_list + film_nof_list))
    end_time = datetime.datetime.now().strftime('%H:%M:%S')
    cron_data = u'%s | %s - %s %s<br />' % (datetime.datetime.now().date(),
                                            start_time, end_time,
                                            u'Импорт сеансов киноход')
    cron_data += u'<br /><b>Получено</b>: %s (с продажей: %s)' % (
        cron_count, cron_count_sale)
    cron_data += u'<br /><b>Новых</b>: %s (с продажей: %s)' % (
        cron_data_new, cron_data_new_sale)
    cron_data += u'<br /><b>Кинотеатров</b>: %s' % cinemas_count
    cron_data += u'<br /><b>Фильмов</b>: %s (не идент: %s)<br />' % (
        film_sum, len(set(film_nof_list)))

    for i in range(50):
        cron_data += u'- '
    process_time = time.time() - t1
    cron_data = u'<br />* %s сек.<br />%s' % (process_time, cron_data)

    open('%s/cron_log_kinohod_schedules.txt' % settings.CRON_LOG_PATH,
         'a').write(cron_data.encode('utf-8'))
    cron_success('json', source.dump, 'schedules', 'Сеансы')
Example #13
def get_premierzal_schedules():
    data_nof_film = ''
    noffilms = []

    ignored = get_ignored_films()

    source = ImportSources.objects.get(url='http://www.premierzal.ru/')
    sfilm_clean(source)

    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    schedules = get_source_data(source, 'schedule', 'list')

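    # group this source's cinemas by the source id of their city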
    cities_cinemas = {}
    for i in SourceCinemas.objects.select_related('city').filter(
            source_obj=source):
        if not cities_cinemas.get(i.city.source_id):
            cities_cinemas[i.city.source_id] = {'city': i.city, 'cinemas': []}
        cities_cinemas[i.city.source_id]['cinemas'].append(i)

    for k, v in cities_cinemas.iteritems():
        city_url_encode = urllib.quote(v['city'].name.encode('utf-8'))
        for i in v['cinemas']:
            main_url = '%s?theatre=%s&city=%s' % (source.url, i.source_id,
                                                  city_url_encode)

            main_req = urllib.urlopen(main_url)
            if main_req.getcode() == 200:
                data = BeautifulSoup(main_req.read())
                data = data.find('div', id="films-list")

                if data:
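                    # gather every date link from the calendar tables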
                    dates = []
                    for calendar in data.findAll('table',
                                                 {'class': 'calendar'}):
                        for a in calendar.findAll('a'):
                            href = a.get('href', '')
                            href_dict = dict(cgi.parse_qsl(href))
                            calendar_date = href_dict.get(
                                u'?date', href_dict.get(u'date'))
                            if calendar_date:
                                dates.append({
                                    'date': calendar_date,
                                    'href': href
                                })

                    for ind, d in enumerate(dates):
                        films_blocks = []
                        if ind == 0:
                            films_blocks = data.findAll(
                                'div', {'class': 'film-item-wrapper'})
                        else:
                            url = '%s?date=%s&city=%s&theatre=%s' % (
                                source.url, d['date'], city_url_encode,
                                i.source_id)

                            req = urllib.urlopen(url)
                            if req.getcode() == 200:
                                data = BeautifulSoup(req.read())
                                data = data.find('div', id="films-list")
                                films_blocks = data.findAll(
                                    'div', {'class': 'film-item-wrapper'})
                            time.sleep(random.uniform(0.8, 2.2))

                        for block in films_blocks:
                            title = block.find('div', {
                                'class': 'title'
                            }).find('a')

                            film_name = title.text.encode('utf-8').strip()
                            film_slug = low(
                                del_separator(del_screen_type(film_name)))
                            film_id = film_slug

                            if film_id not in noffilms and film_slug.decode(
                                    'utf-8') not in ignored:

                                obj = films.get(film_id.decode('utf-8'))
                                next_step = checking_obj(obj)

                                if next_step:
                                    if obj:
                                        kid = obj.kid
                                    else:
                                        kid, info = film_identification(
                                            film_slug,
                                            None, {}, {},
                                            source=source)

                                    objt = None
                                    if kid:
                                        create_new, objt = unique_func(
                                            fdict, kid, obj)
                                        if create_new:
                                            objt = create_sfilm(
                                                film_id, kid, source,
                                                film_name)
                                            films[film_id.decode(
                                                'utf-8')] = objt
                                            if not fdict.get(kid):
                                                fdict[kid] = {
                                                    'editor_rel': [],
                                                    'script_rel': []
                                                }
                                            fdict[kid]['script_rel'].append(
                                                objt)
                                    elif not obj:
                                        data_nof_film += xml_noffilm(
                                            film_name, film_slug, None, None,
                                            film_id, info, None, source.id)
                                        noffilms.append(film_id)

                                    if objt:
                                        year, month, day = d['date'].split(
                                            u'-')

                                        for tm in block.findAll(
                                                'div',
                                            {'class': 'seanse-item'}):
                                            for t in tm.text.encode(
                                                    'utf-8').split('|'):
                                                t = re.findall(
                                                    r'\d{2}\:\d{2}', t)
                                                if t:
                                                    hours, minutes = t[
                                                        0].strip().split(':')
                                                    dtime = datetime.datetime(
                                                        int(year), int(month),
                                                        int(day), int(hours),
                                                        int(minutes))

                                                    sch_id = '%s%s%s' % (
                                                        dtime,
                                                        i.source_id.encode(
                                                            'utf-8'), film_id)
                                                    sch_id = sch_id.replace(
                                                        ' ',
                                                        '').decode('utf-8')

                                                    if sch_id not in schedules:
                                                        SourceSchedules.objects.create(
                                                            source_id=sch_id,
                                                            source_obj=source,
                                                            film=objt,
                                                            cinema=i,
                                                            dtime=dtime,
                                                        )
                                                        schedules.append(
                                                            sch_id)
            time.sleep(random.uniform(1.1, 1.8))

    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
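
Note: every scraper in this collection deduplicates showtimes the same way: the schedule source_id is the datetime concatenated with cinema and film identifiers, spaces stripped, then checked against the list returned by get_source_data. A minimal sketch of that key builder, assuming only the standard library (make_schedule_id is an illustrative name, not from the source):

import datetime

def make_schedule_id(dtime, *parts):
    # Join the datetime with the source-specific parts and strip
    # spaces so the key is compact and stable across runs.
    raw = '%s%s' % (dtime, ''.join(parts))
    return raw.replace(' ', '')

# make_schedule_id(datetime.datetime(2015, 3, 1, 20, 30), '42', 'film-slug')
# returns '2015-03-0120:30:0042film-slug'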
Beispiel #14
0
def get_kinobklass_schedules():
    ignored = get_ignored_films()

    data_nof_film = ''
    noffilms = []

    source = ImportSources.objects.get(url='http://kino-bklass.ru/')
    sfilm_clean(source)

    city_name = 'Серпухов'
    cinema_name = 'Кинотеатр  в ТРК &#034;Б-Класс&#034;'
    city_slug = low(del_separator(city_name))
    cinema_slug = low(del_separator(cinema_name))

    city = City.objects.get(name__name=city_name, name__status=1)
    cinema = Cinema.objects.get(name__name=cinema_name,
                                name__status=1,
                                city=city)

    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    schedules = get_source_data(source, 'schedule', 'list')

    city_obj, city_created = SourceCities.objects.get_or_create(
        source_id=city_slug,
        source_obj=source,
        defaults={
            'source_id': city_slug,
            'source_obj': source,
            'city': city,
            'name': city_name,
        })

    cinema_obj, cinema_created = SourceCinemas.objects.get_or_create(
        source_id=cinema_slug,
        source_obj=source,
        defaults={
            'source_id': cinema_slug,
            'source_obj': source,
            'city': city_obj,
            'cinema': cinema,
            'name': cinema_name,
        })

    today = datetime.datetime.now().date()
    to = today + datetime.timedelta(days=6)
    delta = to - today

    for day in range(delta.days + 1):
        d = today + datetime.timedelta(days=day)
        url = '%s?date=%s' % (source.url, d.strftime("%Y%m%d"))

        req = urllib.urlopen(url)
        if req.getcode() == 200:
            data = BeautifulSoup(req.read(), from_encoding="utf-8")
            table = data.findAll('table', id='rasp', limit=1)[0]
            for td in table.findAll('td', colspan='10'):
                full_url = td.a.get('href')
                film_id = full_url.replace('http://kino-bklass.ru/films/',
                                           '').replace('/', '').encode('utf-8')
                film_name = td.a.h3.string.strip().split('    ')[0].encode(
                    'utf-8')
                film_slug = del_screen_type(low(del_separator(film_name)))

                if film_id not in noffilms and film_slug.decode(
                        'utf-8') not in ignored:

                    obj = films.get(film_id.decode('utf-8'))
                    next_step = checking_obj(obj)

                    if next_step:
                        if obj:
                            kid = obj.kid
                        else:
                            kid, info = film_identification(film_slug,
                                                            None, {}, {},
                                                            source=source)

                        objt = None
                        if kid:
                            create_new, objt = unique_func(fdict, kid, obj)
                            if create_new:
                                objt = create_sfilm(film_id, kid, source,
                                                    film_name)
                                films[film_id.decode('utf-8')] = objt
                                if not fdict.get(kid):
                                    fdict[kid] = {
                                        'editor_rel': [],
                                        'script_rel': []
                                    }
                                fdict[kid]['script_rel'].append(objt)
                        elif not obj:
                            data_nof_film += xml_noffilm(
                                film_name, film_slug, None, None, film_id,
                                info, full_url.encode('utf-8'), source.id)
                            noffilms.append(film_id)

                        if objt:
                            tr = td.find_next('tr')
                            times = []
                            for time_tag in tr.findAll('td'):
                                t = None

                                if time_tag.string:
                                    t = time_tag.string.strip().encode('utf-8')
                                if time_tag.b:
                                    t = time_tag.b.string.strip().encode(
                                        'utf-8')
                                if t:
                                    try:
                                        hours, minutes = t.split(':')
                                    except ValueError:
                                        try:
                                            hours, minutes = t.split('-')
                                        except ValueError:
                                            hours, minutes = t.split('^')

                                    if hours == '24':
                                        hours, minutes = (23, 59)

                                    year, month, day = str(d).split('-')
                                    dtime = datetime.datetime(
                                        int(year), int(month), int(day),
                                        int(hours), int(minutes))

                                    sch_id = '%s%s%s%s' % (dtime, cinema_slug,
                                                           city_slug, film_id)
                                    sch_id = sch_id.replace(' ',
                                                            '').decode('utf-8')

                                    if sch_id not in schedules:
                                        SourceSchedules.objects.create(
                                            source_id=sch_id,
                                            source_obj=source,
                                            film=objt,
                                            cinema=cinema_obj,
                                            dtime=dtime,
                                        )
                                        schedules.append(sch_id)
        time.sleep(3.0)

    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
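
Note: the nested try/except around t.split() above is a separator fallback in disguise; written as a loop over candidate separators it reads more directly. A sketch under the same assumptions (parse_showtime is a hypothetical helper, not part of the source):

def parse_showtime(raw):
    # kino-bklass mixes ':', '-' and '^' as hour/minute separators,
    # and writes hour '24' for the last show of the day.
    for sep in (':', '-', '^'):
        if sep in raw:
            hours, minutes = raw.split(sep)
            break
    else:
        raise ValueError('unrecognized time: %r' % raw)
    if hours == '24':
        hours, minutes = '23', '59'
    return int(hours), int(minutes)

# parse_showtime('24-00') returns (23, 59)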
Beispiel #15
0
def get_premierzal_cinemas():
    source = ImportSources.objects.get(url='http://www.premierzal.ru/')

    cinemas = get_source_data(source, 'cinema', 'list')

    cities_dict = get_source_data(source, 'city', 'dict')

    cinemas_dict = {}
    for i in Cinema.objects.all():
        cinemas_dict[i.code] = i

    ignored_cinemas = get_ignored_cinemas()

    data_nof_cinema = ''

    city = cities_dict.values()[0]

    body = urllib.urlencode({
        'city': city.name.encode('utf-8'),
    })

    url = '%stheatres?%s' % (source.url, body)

    req = urllib.urlopen(url)
    if req.getcode() == 200:
        data = BeautifulSoup(req.read())

        blocks = []

        block1 = data.find('div', {'class': 'this_city_theatres'})
        block2 = data.find('div', {'class': 'other_city_theatres'})

        if block1:
            blocks.append(block1)

        if block2:
            blocks.append(block2)

        for ind, block in enumerate(blocks):
            for a in block.findAll('a'):
                cinema_name = a.text.encode('utf-8').strip().replace('"', '')
                cinema_id = a.get('href').replace('/theatres/',
                                                  '').replace('/', '')

                if ind == 0:
                    city_obj = city
                else:
                    city_name, cinema_name = cinema_name.split(',')
                    cinema_name = cinema_name.strip()
                    city_slug = low(del_separator(city_name.strip()))
                    city_obj = cities_dict.get(city_slug.decode('utf-8'))

                cinema_slug = low(del_separator(cinema_name))

                if city_obj:
                    cinema_ig_id = u'%s__%s' % (cinema_slug.decode('utf-8'),
                                                city_obj.city.kid)

                    if cinema_id.decode(
                            'utf-8'
                    ) not in cinemas and cinema_ig_id not in ignored_cinemas:

                        filter1 = {
                            'name__name': cinema_slug,
                            'name__status': 2,
                            'city__id': city_obj.city_id
                        }

                        cinema = cinema_identification(cinema_slug, filter1)

                        cin_obj = cinemas_dict.get(cinema)
                        if cin_obj:
                            SourceCinemas.objects.create(
                                source_id=cinema_id,
                                source_obj=source,
                                city=city_obj,
                                cinema=cin_obj,
                                name=cinema_name,
                            )
                            cinemas.append(cinema_id.decode('utf-8'))
                        else:
                            data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (
                                cinema_name, cinema_slug,
                                city_obj.name.encode('utf-8'),
                                city_obj.city.kid)

    create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_cinema)
    cron_success('html', source.dump, 'cinemas', 'Кинотеатры')
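
Note: the cinema_ig_id built above keys ignored cinemas by slug plus city kid, because one slug can legitimately occur in several cities. As a tiny sketch (ignored_cinema_key is an illustrative name; Python 2 byte-string semantics as in the source):

def ignored_cinema_key(cinema_slug, city_kid):
    # Slug alone is ambiguous across cities, so the city kid is
    # appended to make the ignore key unique.
    return u'%s__%s' % (cinema_slug.decode('utf-8'), city_kid)

# ignored_cinema_key('pobeda', 77) returns u'pobeda__77'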
Beispiel #16
0
def get_planeta_schedules():
    data_nof_hall = ''

    source = ImportSources.objects.get(url='http://planeta-kino.com.ua/')

    planeta_schedules = get_source_data(source, 'schedule', 'list')
    planeta_cities_dict = get_source_data(source, 'city', 'dict')
    planeta_cinemas_dict = get_source_data(source, 'cinema', 'dict')
    planeta_films_dict = get_source_data(source, 'film', 'dict')

    nof_list = []

    for i in planeta_kino_urls:
        xml = open(
            '%s/dump_planetakino_%s.xml' % (settings.API_DUMP_PATH, i['city']),
            'r')
        xml_data = BeautifulSoup(xml.read(), from_encoding="utf-8")
        xml.close()

        for day in xml_data.findAll('day'):
            release_date = day['date'].encode('utf-8')
            for show in day.findAll('show'):
                cinema_id = show['theatre-id'].encode('utf-8')
                city_id = cinema_id.split('-')[1].encode('utf-8')
                city = planeta_cities_dict.get(city_id)
                cinema = planeta_cinemas_dict.get(cinema_id)
                film_id = show['movie-id']
                film = planeta_films_dict.get(film_id)

                if city and cinema and film:
                    time_data = show['time'].encode('utf-8')
                    technology = show['technology'].encode('utf-8')
                    hall_id = show['hall-id'].encode('utf-8')

                    d = release_date.split('-')
                    t = time_data.split(':')
                    dtimedate = datetime.datetime(int(d[0]), int(d[1]),
                                                  int(d[2]), int(t[0]),
                                                  int(t[1]))

                    planeta_id = '%s%s%s%s%s' % (dtimedate, hall_id, cinema_id,
                                                 city_id, film_id)
                    planeta_id = planeta_id.replace(' ', '')

                    id = '%s%s%s' % (hall_id, cinema_id, city_id)

                    if planeta_id not in planeta_schedules and id not in nof_list:

                        # hall identification
                        hall_obj = Hall.objects.filter(
                            name__name=hall_id,
                            cinema=cinema.cinema).distinct('pk')
                        # if found
                        if hall_obj.count() == 1:
                            # all objects identified, add to the identified set
                            SourceSchedules.objects.get_or_create(
                                source_id=planeta_id,
                                source_obj=source,
                                defaults={
                                    'source_id': planeta_id,
                                    'source_obj': source,
                                    'film': film,
                                    'cinema': cinema,
                                    'hall': hall_obj[0].kid,
                                    'dtime': dtimedate,
                                })
                        # if the hall was not found
                        else:
                            nof_list.append(id)
                            # if this tag is not yet among the not-found ones, add it
                            data_nof_hall += '<hall city="%s" city_kid="%s" cinema="%s" cinema_kid="%s" name="%s" slug="%s" id="%s"></hall>' % (
                                city.name.encode('utf-8'), city.city.kid,
                                cinema.name.encode('utf-8'),
                                cinema.cinema.code, hall_id, hall_id, id)

    create_dump_file('%s_nof_hall' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_hall)
    cron_success('xml', source.dump, 'schedules', 'Сеансы')
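
Note: the hall lookup above counts a hall as identified only when the queryset holds exactly one row; everything else goes to the nof-hall dump. Restated as a helper, assuming the same Hall model the source uses (identify_hall is a hypothetical name):

def identify_hall(hall_id, cinema):
    # Hall is the project's model, as imported by the source module;
    # None signals "ambiguous or missing", to be reported separately.
    qs = Hall.objects.filter(name__name=hall_id,
                             cinema=cinema).distinct('pk')
    return qs[0] if qs.count() == 1 else None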
Beispiel #17
0
def get_planeta_cities_cinemas():
    '''
    Fetch XML session data from PlanetaKino and register its cities and cinemas
    '''
    source = ImportSources.objects.get(url='http://planeta-kino.com.ua/')
    planeta_cities_dict = get_source_data(source, 'city', 'dict')
    planeta_cinemas = get_source_data(source, 'cinema', 'list')

    data_nof_city = ''
    data_nof_cinema = ''

    for i in planeta_kino_urls:
        city_name = i['city_name']
        city_slug = low(del_separator(city_name))
        city_id = i['city']

        req = urllib.urlopen(i['url'])
        if req.getcode() == 200:
            f = open(
                '%s/dump_planetakino_%s.xml' %
                (settings.API_DUMP_PATH, city_id), 'w')
            f.write(str(req.read()))
            f.close()

        city_obj = planeta_cities_dict.get(city_id)
        if not city_obj:
            city = City.objects.filter(name__name=city_slug,
                                       name__status=2).distinct('pk')
            if city.count() == 1:
                city_obj = SourceCities.objects.create(
                    source_id=city_id,
                    source_obj=source,
                    city=city[0],
                    name=city_name,
                )
            else:
                if 'slug="%s"' % city_slug not in data_nof_city:
                    data_nof_city += '<city name="%s" slug="%s"></city>' % (
                        city_name, city_slug)

        if city_obj:
            city_kid = city_obj.city.kid
            cinema_name = 'Планета Кино IMAX'
            cinema_slug = low(del_separator(cinema_name))
            cinema_id = 'imax-%s' % i['city'] if i[
                'city'] == 'kiev' else 'pk-%s' % i['city']

            if cinema_id not in planeta_cinemas:
                filter1 = {
                    'name__name': cinema_slug,
                    'name__status': 2,
                    'city__kid': city_kid
                }
                cinema_kid = cinema_identification(cinema_slug, filter1)
                if cinema_kid:
                    cinema = Cinema.objects.get(code=cinema_kid)
                    SourceCinemas.objects.create(
                        source_id=cinema_id,
                        source_obj=source,
                        city=city_obj,
                        cinema=cinema,
                        name=cinema_name,
                    )
                else:
                    tags = 'slug="%s" city_kid="%s"' % (cinema_slug, city_kid)
                    if tags not in data_nof_cinema:
                        data_nof_cinema += '<cinema name="%s" city="%s" slug="%s" city_kid="%s"></cinema>' % (
                            cinema_name, city_name, cinema_slug, city_kid)

    create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_city)
    create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_cinema)
    cron_success('xml', source.dump, 'cities', 'Города')
    cron_success('xml', source.dump, 'cinemas', 'Кинотеатры')
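
Note: the nof-dump strings in this function avoid duplicate entries with a plain substring test before concatenating. Factored out as a sketch (append_once is an illustrative name):

def append_once(xml_blob, tag):
    # Skip tags already present; fine for these small dumps, though a
    # set of seen keys would scale better.
    if tag not in xml_blob:
        xml_blob += tag
    return xml_blob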
Beispiel #18
0
def get_okinoua_schedules():
    """
    Парсинг сеансов Украины
    """
    source = ImportSources.objects.get(url='http://www.okino.ua/')

    # Get the dict of identified OkinoUA films
    okinoua_films_dict = get_source_data(source, 'film', 'dict')
    # Get the dict of identified OkinoUA cities
    okinoua_cities_dict = get_source_data(source, 'city', 'dict')
    # Get the dict of identified OkinoUA cinemas
    okinoua_cinemas_dict = get_source_data(source, 'cinema', 'dict')
    # Get the list of identified OkinoUA showtimes
    okinoua_schedules = get_source_data(source, 'schedule', 'list')

    counter1 = 0
    for city_id, city_obj in okinoua_cities_dict.iteritems():
        counter1 += 1
        url = '%s%s/' % (source.url, city_id)
        req = urllib.urlopen(url)
        dates = []
        if req.getcode() == 200:
            page = BeautifulSoup(req.read(), from_encoding="utf-8")
            # if the city has any showtimes
            item = page.find('div', {'class': 'item0'})
            if item:
                # collect the dates that have a schedule
                date_div = page.find('div', id='afisha-date')
                dates = [i.get('href').strip() for i in date_div.findAll('a')]

        counter = 0
        for date in dates:
            counter += 1
            url2 = '%s%s' % (url, date)
            req2 = urllib.urlopen(url2)
            if req2.getcode() == 200:
                page2 = BeautifulSoup(req2.read(), from_encoding="utf-8")
                for div in page2.findAll('div', {'class': 'item0'}):
                    cinema_tag = div.find('h3')
                    cinema_id = cinema_tag.a.get('href').replace(
                        '/', '').encode('utf-8')
                    cinema_obj = okinoua_cinemas_dict.get(cinema_id)
                    if cinema_obj:
                        for film in div.findAll('div', {'class': 'item2'}):
                            if film.div.div.a:
                                film_name = film.div.div.a.string.encode(
                                    'utf-8')
                                film_id = film.div.div.a.get('href').replace(
                                    '/film/', '').replace('/',
                                                          '').encode('utf-8')
                            else:
                                film_name = film.div.div.string.strip().encode(
                                    'utf-8')
                                film_id = None

                            film_name_slug = low(
                                del_separator(del_screen_type(film_name)))
                            if not film_id:
                                film_id = film_name_slug

                            film_obj = okinoua_films_dict.get(film_id)

                            if film_obj:
                                showtime = film.find('div',
                                                     {'class': 'showtime'})
                                for time_tag in showtime.findAll('span'):
                                    hours, minutes = time_tag.string.encode(
                                        'utf-8').split(':')
                                    year, month, day = date.replace(
                                        '?date=', '').split('-')
                                    dtime = datetime.datetime(
                                        int(year), int(month), int(day),
                                        int(hours), int(minutes), 0)
                                    id = '%s%s%s%s' % (dtime, cinema_id,
                                                       city_id.encode('utf-8'),
                                                       film_id)
                                    id = id.replace(' ', '')
                                    if id.decode(
                                            'utf-8') not in okinoua_schedules:
                                        SourceSchedules.objects.create(
                                            source_id=id,
                                            source_obj=source,
                                            cinema=cinema_obj,
                                            film=film_obj,
                                            dtime=dtime,
                                        )
                                        okinoua_schedules.append(id)
            if counter % 4 == 0:
                time.sleep(random.uniform(1.0, 3.0))
        if counter1 % 4 == 0:
            time.sleep(random.uniform(1.0, 3.0))
    cron_success('html', 'okinoua', 'schedules', 'Сеансы')
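
Note: both okino.ua crawlers throttle themselves by sleeping a random interval after every fourth request. The same pattern as a sketch (polite_sleep is an illustrative name):

import random
import time

def polite_sleep(counter, every=4, low=1.0, high=3.0):
    # Sleep after every Nth request so the crawl stays gentle, with a
    # randomized delay to avoid a fixed, detectable rhythm.
    if counter % every == 0:
        time.sleep(random.uniform(low, high))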
Beispiel #19
0
def get_okinoua_films():
    """
    Парсинг фильмов Украины
    """
    xml = open('%s/dump_okinoua_nof_film.xml' % settings.NOF_DUMP_PATH, 'r')
    xml_data = BeautifulSoup(xml.read(), from_encoding="utf-8")
    xml.close()

    films_slugs = []
    for i in xml_data.findAll('film'):
        slug = i.get('slug_ru')
        films_slugs.append(slug)

    source = ImportSources.objects.get(url='http://www.okino.ua/')
    data_nof_films = ''
    not_founded_films = []

    # Get the list of identified OkinoUA films
    okinoua_films = get_source_data(source, 'film', 'list')
    # Get the dict of identified OkinoUA cities
    okinoua_cities_dict = get_source_data(source, 'city', 'dict')
    # Get the dict of identified OkinoUA cinemas
    okinoua_cinemas_dict = get_source_data(source, 'cinema', 'dict')

    counter = 0
    for city_id, city_obj in okinoua_cities_dict.iteritems():
        counter += 1
        url = '%s%s/' % (source.url, city_id)
        req = urllib.urlopen(url)
        dates = []
        if req.getcode() == 200:
            page = BeautifulSoup(req.read(), from_encoding="utf-8")

            for div in page.findAll('div', {'class': 'item0'}):
                for film in div.findAll('div', {'class': 'item2'}):
                    alt_name = None
                    if film.div.div.a:
                        film_name = film.div.div.a.string.encode('utf-8')
                        film_a = film.div.div.a.get('href')
                        film_id = film_a.replace('/film/', '').replace(
                            '/', '').encode('utf-8')
                        full_url = '%sfilm/%s' % (source.url, film_id)
                        req_name = urllib.urlopen(full_url)
                        if req_name.getcode() == 200:
                            filmpage = BeautifulSoup(req_name.read(),
                                                     from_encoding="utf-8")
                            title = filmpage.find('div', {'class': 'item'})
                            if title.h4:
                                alt_name = title.h4.text.encode('utf-8')
                                alt_name = re.sub(r'\(.*?\)', '',
                                                  alt_name).strip()
                    else:
                        film_name = film.div.div.string.strip().encode('utf-8')
                        film_id = None

                    film_name_slug = low(
                        del_separator(del_screen_type(film_name)))
                    if not film_id:
                        film_id = film_name_slug.decode('utf-8')

                    if film_id not in okinoua_films:
                        kid, info = film_identification(film_name_slug,
                                                        None, {}, {},
                                                        source=source)
                        if kid:
                            film_obj, created = SourceFilms.objects.get_or_create(
                                source_id=film_id,
                                source_obj=source,
                                defaults={
                                    'source_id': film_id,
                                    'source_obj': source,
                                    'name': film_name,
                                    'kid': kid,
                                    'name_alter': alt_name,
                                })
                        else:
                            slug_tag = 'slug_ru="%s"' % film_name_slug
                            if slug_tag not in data_nof_films and film_name_slug.decode(
                                    'utf-8') not in films_slugs:
                                data_nof_films += xml_noffilm(
                                    film_name, film_name_slug, None, None,
                                    film_id.encode('utf-8'), info,
                                    full_url.encode('utf-8'), source.id)
                        okinoua_films.append(film_id)
        if counter % 4 == 0:
            time.sleep(random.uniform(1.0, 3.0))

    xml_data = str(xml_data).replace('<html><head></head><body><data>',
                                     '').replace('</data></body></html>', '')
    xml_data = '<data>%s%s</data>' % (xml_data, data_nof_films)

    create_dump_file('okinoua_nof_film', settings.NOF_DUMP_PATH, xml_data)
    cron_success('html', 'okinoua', 'films', 'Фильмы')
Beispiel #20
0
def get_okinoua_cinemas():
    """
    Парсинг кинотеатров Украины
    """
    source = ImportSources.objects.get(url='http://www.okino.ua/')

    # Get the list of identified OkinoUA cinemas
    cinemas_ids = get_source_data(source, 'cinema', 'list')
    data_nof_cinema = ''

    # Get the dict of identified OkinoUA cities
    okinoua_cities_dict = get_source_data(source, 'city', 'dict')

    cinemas = Cinema.objects.all()
    cinemas_dict = {}
    for i in cinemas:
        cinemas_dict[i.code] = i

    counter = 0
    # Open the URL if it is reachable and parse the response with BeautifulSoup
    for city_id, city_obj in okinoua_cities_dict.iteritems():
        counter += 1
        url = '%s%s/' % (source.url, city_id)
        req = urllib.urlopen(url)
        if req.getcode() == 200:
            page = BeautifulSoup(req.read(), from_encoding="utf-8")
            # Find all cinema blocks and read the cinema ids and names from them
            for div in page.findAll('div', {'class': 'item0'}):
                cinema_tag = div.find('h3')
                cinema_id = cinema_tag.a.get('href').replace('/', '')
                cinema_name = cinema_tag.a.string.encode('utf-8')
                cinema_slug = low(del_separator(cinema_name))
                if cinema_id not in cinemas_ids:
                    filter = {
                        'name__name': cinema_slug,
                        'name__status': 2,
                        'city__id': city_obj.city_id
                    }
                    cinema_kid = cinema_identification(cinema_slug, filter)
                    if cinema_kid:
                        try:
                            cinema = Cinema.objects.get(code=cinema_kid)
                            cinema_obj = SourceCinemas.objects.create(
                                source_id=cinema_id,
                                source_obj=source,
                                city=city_obj,
                                cinema=cinema,
                                name=cinema_name,
                            )
                        except Cinema.DoesNotExist:
                            pass
                    else:
                        if 'slug="%s"' % cinema_slug not in data_nof_cinema:
                            data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (
                                cinema_name, cinema_slug,
                                city_obj.name.encode('utf-8'),
                                city_obj.city.kid)
        if counter % 4 == 0:
            time.sleep(random.uniform(1.0, 3.0))

    create_dump_file('okinoua_nof_cinema', settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_cinema)
    cron_success('html', 'okinoua', 'cinemas', 'Укр. кинотеатры')
Beispiel #21
0
def get_kinohod_cinemas():
    #    print "BEGIN get_kinohod_cinemas()"
    t1 = time.time()
    start_time = datetime.datetime.now().strftime('%H:%M:%S')

    cron_data_new = ''
    cron_data_nof = ''
    cron_count = 0

    main_url = 'http://www.kinohod.ru/api/rest/partner/v1/cinemas?apikey=%s' % SERVER_API_KEY

    source = ImportSources.objects.get(url='http://kinohod.ru/')
    kinohod_cinemas = get_source_data(source, 'cinema', 'list')
    kinohod_cities_dict = get_source_data(source, 'city', 'dict')

    cinemass = Cinema.objects.all()
    cinemass_dict = {}
    for i in cinemass:
        cinemass_dict[i.code] = i

    count = 0
    data_nof_cinema = ''
    for cid, kinohod_city in kinohod_cities_dict.iteritems():
        try:
            url = '%s&city=%s' % (main_url, cid)
            req = urllib.urlopen(url)
            if req.getcode() == 200:
                json_data = req.read()
                data = json.loads(json_data)
                for i in data:
                    cron_count += 1
                    id = str(i['id']).decode('utf-8')
                    if id not in kinohod_cinemas:
                        name = i['title']
                        name_slug = del_screen_type(name.encode('utf-8'))
                        name_slug = low(del_separator(name_slug))
                        short_name = i['shortTitle']
                        short_name_slug = del_screen_type(
                            short_name.encode('utf-8'))
                        short_name_slug = low(del_separator(short_name_slug))

                        filter1 = {
                            'name__name': name_slug,
                            'name__status': 2,
                            'city__id': kinohod_city.city_id
                        }
                        filter2 = {
                            'name__name': short_name_slug,
                            'name__status': 2,
                            'city__id': kinohod_city.city_id
                        }
                        cinema_kid = cinema_identification(
                            short_name_slug, filter1, filter2)
                        cin_obj = cinemass_dict.get(cinema_kid)
                        if cin_obj:
                            SourceCinemas.objects.create(
                                source_id=id,
                                source_obj=source,
                                city=kinohod_city,
                                cinema=cin_obj,
                                name=name,
                                name_alter=short_name,
                            )
                            cron_data_new += '%s<br />' % short_name.encode(
                                'utf-8')
                        else:
                            count += 1
                            name_city = kinohod_city.name
                            data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (
                                short_name.encode('utf-8'), short_name_slug,
                                name_city.encode('utf-8'),
                                kinohod_city.city.kid)
                            cron_data_nof += '%s<br />' % short_name.encode(
                                'utf-8')
                        kinohod_cinemas.append(id)
        except IOError:
            open('%s/ddd.txt' % settings.API_DUMP_PATH,
                 'a').write(str(url) + '\n')
    data_nof_cinema += '<sum>%s</sum>' % count
    create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_cinema)

    # cron log
    end_time = datetime.datetime.now().strftime('%H:%M:%S')
    cron_data = '%s | %s - %s %s\n' % (datetime.datetime.now().date(),
                                       start_time, end_time,
                                       'Импорт кинотеатров киноход')
    cron_data += '<br /><b>Обработано</b>: %s' % cron_count
    cron_data += '<br /><b>Новые</b>: <br />%s' % cron_data_new
    cron_data += '<br /><b>Ненайденные</b>: <br />%s' % cron_data_nof
    for i in range(50):
        cron_data += '- '
    process_time = time.time() - t1
    cron_data = '<br />* %s сек.<br />%s' % (process_time, cron_data)
    open('%s/cron_log_kinohod_cinemas.txt' % settings.CRON_LOG_PATH,
         'a').write(cron_data)
    cron_success('json', source.dump, 'cinemas', 'Кинотеатры')
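
Note: the cron-log epilogue here reappears almost verbatim in get_kinohod_films and get_kinohod_cities below, so it could be factored into one helper. A sketch under the same assumptions (write_kinohod_cron_log is hypothetical; settings.CRON_LOG_PATH is the project setting already used above):

import datetime
import time

def write_kinohod_cron_log(job, label, start_time, t1, count, new, nof):
    # Same shape as the inline blocks: date and time range, totals,
    # new and not-found entries, elapsed seconds, one file per job.
    end_time = datetime.datetime.now().strftime('%H:%M:%S')
    data = '%s | %s - %s %s\n' % (datetime.datetime.now().date(),
                                  start_time, end_time, label)
    data += '<br /><b>Processed</b>: %s' % count
    data += '<br /><b>New</b>: <br />%s' % new
    data += '<br /><b>Not found</b>: <br />%s' % nof
    data += '- ' * 50
    data = '<br />* %s sec.<br />%s' % (time.time() - t1, data)
    open('%s/cron_log_%s.txt' % (settings.CRON_LOG_PATH, job),
         'a').write(data)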
Beispiel #22
0
def get_rottentomatoes_films(everyday=True):
    def get_critic(block):
        critic = block.findAll('div', id="scoreStats", limit=1)

        if critic:
            critic = critic[0].findAll('div')

            average = critic[0].find('span', {
                'class': 'subtle superPageFontColor'
            }).next_sibling.string.strip()
            reviews = critic[1].findAll('span', limit=2)[1].text.strip()
            fresh = critic[2].find('span', {
                'class': 'subtle superPageFontColor'
            }).next_sibling.string.strip()
            rotten = critic[3].find('span', {
                'class': 'subtle superPageFontColor'
            }).next_sibling.string.strip()

            return '%s;%s;%s;%s' % (average.replace(
                '/10', ''), reviews, fresh, rotten)
        else:
            return 'N/A;0;0;0'
        '''
        critic = block.findAll('p', {'class': 'critic_stats'}, limit=1)[0]
        average, reviews = critic.findAll('span', limit=2)
        try:
            fresh, rotten = reviews.next_sibling.next_sibling.encode('utf-8').strip().split(' | ')
        except AttributeError:
            return 'N/A;0;0;0'
            
        fresh = fresh.replace('Fresh:','').strip()
        rotten = rotten.replace('Rotten:','').strip()
        average = average.string.encode('utf-8').split('/')[0]
        reviews = reviews.string.encode('utf-8')

        return '%s;%s;%s;%s' % (average, reviews, fresh, rotten)
        '''

    source = ImportSources.objects.get(url='http://www.rottentomatoes.com/')
    sfilm_clean(source)

    noffilms = []
    data_nof_film = ''

    filter = {'source_obj': source}

    if everyday:
        today = datetime.datetime.today().date()
        day7 = today + datetime.timedelta(days=7)
        today = today - datetime.timedelta(days=30)
        filter['text__gte'] = today
        filter['text__lt'] = day7

    exists = get_source_data(source, 'film', 'list')

    films = {}
    source_films = SourceFilms.objects.filter(**filter)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    ignored = get_ignored_films()

    opener = urllib2.build_opener()
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 5.1; rv:10.0.1) Gecko/20100101 Firefox/10.0.1',
    }
    opener.addheaders = headers.items()

    updated = []

    for k, f in films.items():
        film_url = '%s%s' % (source.url, k)
        req = opener.open(film_url)
        if req.getcode() == 200:
            data = BeautifulSoup(req.read(), from_encoding="utf-8")
            extra = get_critic(data)
            f.extra = extra
            f.save()
            updated.append(k)
        time.sleep(1)

    u = 'http://www.rottentomatoes.com/api/private/v1.0/m/list/find?page=1&limit=50&type=opening&minTomato=0&maxTomato=100&minPopcorn=0&maxPopcorn=100&services=&genres=1%3B2%3B4%3B5%3B6%3B8%3B9%3B10%3B11%3B13%3B14%3B18&sortBy=popularity&certified=false'

    req = opener.open(u)
    if req.getcode() == 200:
        data = json.loads(req.read(), encoding="latin-1")

        for i in data['results']:
            title = i['title'].encode('utf-8')
            title_slug = low(del_separator(title))

            url = i['url'].lstrip('/')

            full_url = '%s%s' % (source.url, url)

            if url not in exists and url not in noffilms:
                if title_slug.decode(
                        'utf-8') not in ignored and url not in updated:

                    time.sleep(1)
                    req2 = opener.open(full_url)
                    if req2.getcode() == 200:

                        data2 = BeautifulSoup(req2.read(),
                                              from_encoding="utf-8")

                        year_block = data2.find('h1',
                                                {'class': 'title hidden-xs'})
                        if not year_block:
                            year_block = data2.find('h1', id='movie-title')

                        year_tmp = year_block.find('span', {
                            'class': 'h3 year'
                        }).text.encode('utf-8')

                        year = int(year_tmp.replace('(', '').replace(')', ''))

                        release_date = data2.find('td',
                                                  itemprop="datePublished")
                        if release_date:
                            release_date = release_date.get('content')

                        extra = get_critic(data2)

                        obj = films.get(url)
                        next_step = checking_obj(obj)

                        if next_step:
                            if obj:
                                kid = obj.kid
                                obj.extra = extra
                                obj.save()
                            else:
                                kid, info = film_identification(None,
                                                                title_slug, {},
                                                                {},
                                                                year,
                                                                source=source)

                            objt = None
                            if kid:
                                create_new, objt = unique_func(fdict, kid, obj)
                                if create_new:
                                    new = create_sfilm(url,
                                                       kid,
                                                       source,
                                                       title,
                                                       txt=release_date,
                                                       extra=extra)
                                    films[url] = new
                                    if not fdict.get(kid):
                                        fdict[kid] = {
                                            'editor_rel': [],
                                            'script_rel': []
                                        }
                                    fdict[kid]['script_rel'].append(new)
                            elif not obj:
                                data_nof_film += xml_noffilm(
                                    title, title_slug, None, None,
                                    url.encode('utf-8'), info,
                                    full_url.encode('utf-8'), source.id)
                                noffilms.append(url)

    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'films', 'Фильмы, рейтинг')
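
Note: get_critic() above packs its four numbers into one semicolon-separated string stored on the film record, with 'N/A;0;0;0' as the no-data marker. Reading it back could look like this sketch (parse_critic_extra is hypothetical and assumes the counts are plain integers):

def parse_critic_extra(extra):
    # extra has the shape 'average;reviews;fresh;rotten'.
    average, reviews, fresh, rotten = extra.split(';')
    return {
        'average': None if average == 'N/A' else float(average),
        'reviews': int(reviews),
        'fresh': int(fresh),
        'rotten': int(rotten),
    }

# parse_critic_extra('7.2;150;120;30')
# returns {'average': 7.2, 'reviews': 150, 'fresh': 120, 'rotten': 30}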
Beispiel #23
0
def get_kinohod_films():
    #    print "BEGIN get_kinohod_films()"

    ignored = get_ignored_films()

    t1 = time.time()
    start_time = datetime.datetime.now().strftime('%H:%M:%S')

    cron_data_new = ''
    cron_data_nof = ''
    cron_count = 0
    noffilms = []

    source = ImportSources.objects.get(url='http://kinohod.ru/')

    sfilm_clean(source)

    kinohod_cities = get_source_data(source, 'city', 'list')

    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    data_nof_films = ''
    main_url = 'http://www.kinohod.ru/api/rest/partner/v1/movies?apikey=%s' % SERVER_API_KEY
    for city_id in kinohod_cities:
        try:
            url = '%s&city=%s' % (main_url, city_id)
            req = urllib.urlopen(url)

            if req.getcode() == 200:
                json_data = req.read()
                data = json.loads(json_data)
                for i in data:
                    cron_count += 1
                    film_id = str(i['id']).decode('utf-8')
                    year = int(
                        i['productionYear']) if i['productionYear'] else None
                    name_ru = i['title'].encode('utf-8')
                    name_ru_slug = low(del_separator(del_screen_type(name_ru)))
                    full_url = '%smovie/%s/' % (source.url, film_id)
                    name_en = None
                    name_en_slug = None
                    if i['originalTitle']:
                        name_en = i['originalTitle'].encode('utf-8')
                        name_en_slug = low(
                            del_separator(del_screen_type(name_en)))

                    if year and name_ru_slug.decode(
                            'utf-8'
                    ) not in ignored and film_id not in noffilms:

                        obj = films.get(film_id)
                        next_step = checking_obj(obj)

                        if next_step:
                            try:
                                if obj:
                                    kid = obj.kid
                                else:
                                    kid, info = film_identification(
                                        name_ru_slug,
                                        name_en_slug, {}, {},
                                        year=year,
                                        source=source)

                                objt = None
                                if kid:
                                    create_new, objt = unique_func(
                                        fdict, kid, obj)
                                    if create_new:
                                        new = create_sfilm(film_id,
                                                           kid,
                                                           source,
                                                           name_ru,
                                                           name_alt=name_en,
                                                           year=year)
                                        films[film_id] = new
                                        if not fdict.get(kid):
                                            fdict[kid] = {
                                                'editor_rel': [],
                                                'script_rel': []
                                            }
                                        fdict[kid]['script_rel'].append(new)
                                        cron_data_new += '%s<br />' % name_ru
                                elif not obj:
                                    if not name_en:
                                        name_en = '*'
                                        name_en_slug = '*'
                                    data_nof_films += xml_noffilm(
                                        name_ru, name_ru_slug,
                                        name_en, name_en_slug,
                                        film_id.encode('utf-8'), info,
                                        full_url.encode('utf-8'), source.id)
                                    noffilms.append(film_id)
                                    cron_data_nof += '%s<br />' % name_ru
                            except db.backend.Database._mysql.OperationalError:
                                pass

        except IOError:
            open('%s/ddd.txt' % settings.API_DUMP_PATH,
                 'a').write(str(url) + '\n')
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_films)

    # cron log
    end_time = datetime.datetime.now().strftime('%H:%M:%S')
    cron_data = '%s | %s - %s %s\n' % (datetime.datetime.now().date(),
                                       start_time, end_time,
                                       'Импорт фильмов киноход')
    cron_data += '<br /><b>Обработано</b>: %s' % cron_count
    cron_data += '<br /><b>Новые</b>: <br />%s' % cron_data_new
    cron_data += '<br /><b>Ненайденные</b>: <br />%s' % cron_data_nof
    for i in range(50):
        cron_data += '- '
    process_time = time.time() - t1
    cron_data = '<br />* %s сек.<br />%s' % (process_time, cron_data)
    open('%s/cron_log_kinohod_films.txt' % settings.CRON_LOG_PATH,
         'a').write(cron_data)
    cron_success('json', source.dump, 'films', 'Фильмы')
Beispiel #24
0
def get_oreanda_and_spartak():
    ignored = get_ignored_films()

    city_name = 'Ялта'

    city_slug = low(del_separator(city_name))

    xdata = (
        {
            'url': 'http://yaltakino.com/Oreanda/',
            'eng': 'Oreanda',
            'ru': 'Ореанда'
        },
        {
            'url': 'http://yaltakino.com/Spartak/',
            'eng': 'Spartak',
            'ru': 'Спартак'
        },
    )

    for data in xdata:

        data_nof_film = ''
        noffilms = []

        source = ImportSources.objects.get(url=data['url'])
        sfilm_clean(source)

        films = {}
        source_films = SourceFilms.objects.filter(source_obj=source)
        for i in source_films:
            films[i.source_id] = i
        fdict = get_all_source_films(source, source_films)

        schedules = get_source_data(source, 'schedule', 'list')

        city = City.objects.get(name__name=city_name, name__status=1)

        city_obj, city_created = SourceCities.objects.get_or_create(
            source_id=city_slug,
            source_obj=source,
            defaults={
                'source_id': city_slug,
                'source_obj': source,
                'city': city,
                'name': city_name,
            })

        cinema_name = data['ru']
        cinema_eng = data['eng']

        cinema_slug = low(del_separator(cinema_name))
        cinema = Cinema.objects.get(name__name=cinema_name,
                                    name__status=1,
                                    city=city)
        cinema_obj, cinema_created = SourceCinemas.objects.get_or_create(
            source_id=cinema_slug,
            source_obj=source,
            defaults={
                'source_id': cinema_slug,
                'source_obj': source,
                'city': city_obj,
                'cinema': cinema,
                'name': cinema_name,
            })

        main_url = '%sschedule/' % source.url

        req = urllib.urlopen(main_url)
        if req.getcode() == 200:
            page = BeautifulSoup(req.read())

            main = page.find('td', {'class': 'contentplaceholder'})

            for div in main.findAll('div', {'class': 'scheduleDayCaption'}):
                sess_date, sess_day = div.text.split(' / ')

                day, month = sess_date.split()
                month = get_month(month.encode('utf-8'))
                year = datetime.datetime.now().year

                table = div.find_next('table')

                for tr in table.findAll('tr'):

                    if tr.find('td', {'class': 'scheduleTime'}):
                        hour, minute = tr.find('td', {
                            'class': 'scheduleTime'
                        }).text.split(':')
                        film = tr.find('a', {'class': 'scheduleLink'})
                        film_url = film.get('href')

                        full_url = 'http://yaltakino.com%s' % film_url
                        film_id = film_url.replace('/%s/?filmid=' % cinema_eng,
                                                   '')
                        film_name = del_screen_type(
                            film.text.encode('utf-8')).strip()
                        film_slug = low(del_separator(film_name))

                        if film_id.encode(
                                'utf-8') not in noffilms and film_slug.decode(
                                    'utf-8') not in ignored:
                            obj = films.get(film_id)
                            next_step = checking_obj(obj)

                            if next_step:
                                if obj:
                                    kid = obj.kid
                                else:
                                    kid, info = film_identification(
                                        film_slug, None, {}, {}, source=source)

                                objt = None
                                if kid:
                                    create_new, objt = unique_func(
                                        fdict, kid, obj)
                                    if create_new:
                                        objt = create_sfilm(
                                            film_id, kid, source, film_name)
                                        films[film_id] = objt
                                        if not fdict.get(kid):
                                            fdict[kid] = {
                                                'editor_rel': [],
                                                'script_rel': []
                                            }
                                        fdict[kid]['script_rel'].append(objt)
                                elif not obj:
                                    data_nof_film += xml_noffilm(
                                        film_name, film_slug, None, None,
                                        film_id.encode('utf-8'), info,
                                        full_url.encode('utf-8'), source.id)
                                    noffilms.append(film_id.encode('utf-8'))

                                if objt:
                                    dtime = datetime.datetime(
                                        year, int(month), int(day), int(hour),
                                        int(minute))

                                    sch_id = '%s%s%s%s' % (
                                        dtime, cinema_slug, city_slug,
                                        film_id.encode('utf-8'))
                                    sch_id = sch_id.replace(' ',
                                                            '').decode('utf-8')

                                    if sch_id not in schedules:
                                        SourceSchedules.objects.create(
                                            source_id=sch_id,
                                            source_obj=source,
                                            film=objt,
                                            cinema=cinema_obj,
                                            dtime=dtime,
                                        )
                                        schedules.append(sch_id)

        create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                         '<data>%s</data>' % data_nof_film)
        cron_success('html', source.dump, 'schedules', 'Сеансы')
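
Note: the schedule pages here expose only day and month, and the code stamps them with the current year. Around New Year that mislabels January shows scraped in December; a guard for that edge case might look like this sketch (schedule_year is hypothetical, with a simple six-month heuristic):

import datetime

def schedule_year(month, today=None):
    # If the parsed month lies far behind the current one, assume the
    # show belongs to the next calendar year.
    today = today or datetime.date.today()
    if month < today.month - 6:
        return today.year + 1
    return today.year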
Beispiel #25
0
def get_kinohod_cities():
    #    print "BEGIN get_kinohod_cities()"
    t1 = time.time()
    start_time = datetime.datetime.now().strftime('%H:%M:%S')

    cron_data_new = ''
    cron_data_nof = ''
    cron_count = 0

    url = 'http://www.kinohod.ru/api/rest/partner/v1/cities?apikey=%s' % SERVER_API_KEY

    source = ImportSources.objects.get(url='http://kinohod.ru/')

    req = urllib.urlopen(url)
    if req.getcode() == 200:
        kinohod_cities = get_source_data(source, 'city', 'list')

        data_nof_city = ''
        json_data = req.read()
        data = json.loads(json_data)
        for i in data:
            cron_count += 1
            id = str(i['id']).decode('utf-8')

            if id not in kinohod_cities:
                alias = i['alias']
                name = i['name'].encode('utf-8')
                name_slug = del_screen_type(low(del_separator(name)))

                city = City.objects.filter(name__name=name_slug,
                                           name__status=2).distinct('pk')
                if city.count() == 1:
                    SourceCities.objects.create(
                        source_id=id,
                        source_obj=source,
                        city=city[0],
                        name=name,
                        name_alter=alias,
                    )
                    cron_data_new += '%s<br />' % name
                else:
                    data_nof_city += '<city name="%s" slug="%s"></city>' % (
                        name, name_slug)
                    cron_data_nof += '%s<br />' % name
                kinohod_cities.append(id)

        create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH,
                         '<data>%s</data>' % data_nof_city)

    # cron log
    end_time = datetime.datetime.now().strftime('%H:%M:%S')
    cron_data = '%s | %s - %s %s<br />' % (datetime.datetime.now().date(),
                                           start_time, end_time,
                                           'Импорт городов киноход')
    cron_data += '<br /><b>Обработано</b>: %s' % cron_count
    cron_data += '<br /><b>Новые</b>: <br />%s' % cron_data_new
    cron_data += '<br /><b>Ненайденные</b>: <br />%s' % cron_data_nof
    cron_data += '- ' * 50
    process_time = time.time() - t1
    cron_data = '<br />* %s сек.<br />%s' % (process_time, cron_data)
    with open('%s/cron_log_kinohod_cities.txt' % settings.CRON_LOG_PATH,
              'a') as f:
        f.write(cron_data)
    cron_success('json', source.dump, 'cities', 'Города')
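The fetch-and-diff step above boils down to: download the JSON city list, keep only the ids not yet imported. A standalone sketch, assuming the endpoint returns a JSON array of objects with id, alias and name fields, as the loop implies:

import json
import urllib

def fetch_new_cities(url, known_ids):
    # Returns the city dicts whose ids are not yet in known_ids.
    req = urllib.urlopen(url)
    if req.getcode() != 200:
        return []
    cities = json.loads(req.read())
    return [c for c in cities
            if str(c['id']).decode('utf-8') not in known_ids]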
Beispiel #26
0
def get_kinomagnat_schedules():
    ignored = get_ignored_films()

    data_nof_film = ''
    data_nof_hall = ''
    data_nof_cinema = ''
    noffilms = []
    nofhalls = []

    city_name = 'Киев'
    cinema_name = 'Магнат'
    city_slug = low(del_separator(city_name))
    cinema_slug = low(del_separator(cinema_name))

    source = ImportSources.objects.get(url='http://www.kinomagnat.com.ua/')
    sfilm_clean(source)

    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    schedules = get_source_data(source, 'schedule', 'list')

    halls = get_source_data(source, 'hall', 'dict')

    city = City.objects.get(name__name=city_name, name__status=1)

    try:
        cinema = Cinema.objects.get(name__name=cinema_name, name__status=1, city=city)
    except Cinema.DoesNotExist:
        cinema = None
        data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (cinema_name, cinema_slug, city_name, city.kid)

    if cinema:
        city_obj, city_created = SourceCities.objects.get_or_create(
            source_id=city_slug,
            source_obj=source,
            defaults={
                'source_id': city_slug,
                'source_obj': source,
                'city': city,
                'name': city_name,
            })

        cinema_obj, cinema_created = SourceCinemas.objects.get_or_create(
            source_id=cinema_slug,
            source_obj=source,
            defaults={
                'source_id': cinema_slug,
                'source_obj': source,
                'city': city_obj,
                'cinema': cinema,
                'name': cinema_name,
            })

        cinema_kid = cinema.code
        city_kid = city.kid

        today = datetime.date.today()

        url = '%sseans.html?device=iphone' % source.url

        req = urllib.urlopen(url)

        if req.getcode() == 200:

            data = BeautifulSoup(req.read())
            div = data.find('div', {'class': 'contentpaneopen'})

            for table in div.findAll('table'):
                try:
                    day, month = table.find_all_previous("p", limit=2)[1].text.strip().split()
                except ValueError:
                    try:
                        day, month = table.find_all_previous("p", limit=3)[2].text.strip().split()
                    except ValueError:
                        day, month = table.find_all_previous("p", limit=4)[3].text.strip().split()

                month = get_month_ua(low(month.encode('utf-8')))
                date_sch = datetime.date(today.year, month, int(day))

                hall_name = table.findAll('tr', limit=1)[0].text.strip().encode('utf-8')
                hall_name_slug = low(del_separator(hall_name))

                if hall_name_slug not in nofhalls:

                    hall_obj = halls.get(hall_name_slug)

                    if not hall_obj:
                        halls_obj = Hall.objects.filter(name__name=hall_name_slug, cinema=cinema_obj.cinema).distinct('pk')
                        if halls_obj.count() == 1:
                            hall_kid = halls_obj[0].kid

                            hall_obj = SourceHalls.objects.create(
                                source_id=hall_name_slug,
                                source_obj=source,
                                cinema=cinema_obj,
                                name=hall_name,
                                kid=hall_kid,
                            )

                            halls[hall_name_slug] = hall_obj
                        else:
                            id = '%s%s%s%s' % (city_kid, cinema_kid, hall_name, hall_name_slug)
                            id = id.replace(' ', '')
                            data_nof_hall += '<hall city="%s" city_kid="%s" cinema="%s" cinema_kid="%s" name="%s" slug="%s" id="%s"></hall>' % (city_name, city_kid, cinema_name, cinema_kid, hall_name, hall_name_slug, id)
                            nofhalls.append(hall_name_slug)

                    if hall_obj:
                        for ind, tr in enumerate(table.findAll('tr')):
                            if ind != 0:
                                showtime, film_data = tr.findAll('td', limit=2)

                                hour, minute = showtime.text.strip().encode('utf-8').split(':')

                                dtime = datetime.datetime(date_sch.year, date_sch.month, date_sch.day, int(hour), int(minute))

                                a = film_data.find('a')
                                film_id = a.get('href').encode('utf-8')
                                full_url = '%s%s' % (source.url, film_id.lstrip('/'))
                                film_name = a.text.strip().encode('utf-8')
                                film_slug = low(del_separator(film_name))

                                if film_id not in noffilms and film_slug.decode('utf-8') not in ignored:
                                    obj = films.get(film_id)
                                    next_step = checking_obj(obj)

                                    if next_step:
                                        if obj:
                                            kid = obj.kid
                                        else:
                                            kid, info = film_identification(film_slug, None, {}, {}, source=source)

                                        objt = None
                                        if kid:
                                            create_new, objt = unique_func(fdict, kid, obj)
                                            if create_new:
                                                objt = create_sfilm(film_id, kid, source, film_name)
                                                films[film_id] = objt
                                                if not fdict.get(kid):
                                                    fdict[kid] = {'editor_rel': [], 'script_rel': []}
                                                fdict[kid]['script_rel'].append(objt)
                                        elif not obj:
                                            data_nof_film += xml_noffilm(film_name, film_slug, None, None, film_id, info, full_url.encode('utf-8'), source.id)
                                            noffilms.append(film_id)

                                        if objt:
                                            sch_id = '%s%s%s' % (dtime, hall_obj.id, film_id)
                                            sch_id = sch_id.replace(' ', '').decode('utf-8')

                                            if sch_id not in schedules:
                                                SourceSchedules.objects.create(
                                                    source_id=sch_id,
                                                    source_obj=source,
                                                    film=objt,
                                                    cinema=cinema_obj,
                                                    hall=hall_obj.kid,
                                                    dtime=dtime,
                                                )
                                                schedules.append(sch_id)

    create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_cinema)
    create_dump_file('%s_nof_hall' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_hall)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH, '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
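get_month_ua is taken as given above; it evidently maps a lowercased, utf-8-encoded Ukrainian month name (genitive, as it appears in dates) to a month number. A plausible sketch, not the source's actual implementation:

UA_MONTHS = {
    'січня': 1, 'лютого': 2, 'березня': 3, 'квітня': 4,
    'травня': 5, 'червня': 6, 'липня': 7, 'серпня': 8,
    'вересня': 9, 'жовтня': 10, 'листопада': 11, 'грудня': 12,
}

def get_month_ua(name):
    # name is a lowercased utf-8 byte string, as produced by
    # low(month.encode('utf-8')) in the caller.
    return UA_MONTHS.get(name)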
Beispiel #27
0
def get_arsenalclub_schedules():
    ignored = get_ignored_films()

    data_nof_film = ''
    noffilms = []

    city_name = 'Нефтекамск'
    cinema_name = 'Арсенал'
    city_slug = low(del_separator(city_name))
    cinema_slug = low(del_separator(cinema_name))

    source = ImportSources.objects.get(url='http://arsenal-club.com/')

    sfilm_clean(source)

    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    schedules = get_source_data(source, 'schedule', 'list')

    city = City.objects.get(name__name=city_name, name__status=1)
    cinema = Cinema.objects.get(name__name=cinema_name,
                                name__status=1,
                                city=city)

    city_obj, city_created = SourceCities.objects.get_or_create(
        source_id=city_slug,
        source_obj=source,
        defaults={
            'source_id': city_slug,
            'source_obj': source,
            'city': city,
            'name': city_name,
        })

    cinema_obj, cinema_created = SourceCinemas.objects.get_or_create(
        source_id=cinema_slug,
        source_obj=source,
        defaults={
            'source_id': cinema_slug,
            'source_obj': source,
            'city': city_obj,
            'cinema': cinema,
            'name': cinema_name,
        })

    today = datetime.datetime.now().date()
    future = today + datetime.timedelta(days=6)
    delta = future - today
    for d in range(delta.days + 1):
        date = today + datetime.timedelta(days=d)
        flag = False
        url = '%skino/?rasdel=kino&day=%s#daybox' % (source.url,
                                                     date.strftime('%d.%m'))

        req = urllib.urlopen(url)
        if req.getcode() == 200:
            data = BeautifulSoup(req.read())  #, from_encoding="utf-8"

            for table in data.findAll('table',
                                      width="100%",
                                      cellpadding="3",
                                      cellspacing="1",
                                      bgcolor="#393939"):
                trs = table.findAll('tr', bgcolor="#292929")
                if len(trs) == 0:
                    flag = True
                else:
                    for tr in trs:
                        times, film, price = tr.findAll('td')

                        full_url = film.a.get('href').encode(
                            'utf-8') if film.a and film.a.get('href') else None

                        if full_url:
                            film_name = film.a.text.encode('utf-8').strip()
                        else:
                            film_name = film.text.encode('utf-8').strip()

                        film_slug = del_screen_type(
                            low(del_separator(film_name)))

                        if full_url:
                            film_id = full_url.replace(
                                'http://www.kinopoisk.ru/film/',
                                '').encode('utf-8')
                        else:
                            film_id = film_slug

                        if film_id not in noffilms and film_slug.decode(
                                'utf-8') not in ignored:

                            obj = films.get(film_id)
                            next_step = checking_obj(obj)

                            if next_step:
                                if obj:
                                    kid = obj.kid
                                else:
                                    kid, info = film_identification(
                                        film_slug, None, {}, {}, source=source)

                                objt = None
                                if kid:
                                    create_new, objt = unique_func(
                                        fdict, kid, obj)
                                    if create_new:
                                        objt = create_sfilm(
                                            film_id, kid, source, film_name)
                                        films[film_id] = objt
                                        if not fdict.get(kid):
                                            fdict[kid] = {
                                                'editor_rel': [],
                                                'script_rel': []
                                            }
                                        fdict[kid]['script_rel'].append(objt)
                                elif not obj:
                                    data_nof_film += xml_noffilm(
                                        film_name, film_slug, None, None,
                                        film_id, info, full_url, source.id)
                                    noffilms.append(film_id)

                                if objt:

                                    hours, minutes = times.string.split(':')

                                    dtime = datetime.datetime(
                                        date.year, date.month, date.day,
                                        int(hours), int(minutes))

                                    sch_id = '%s%s%s%s' % (dtime, cinema_slug,
                                                           city_slug, film_id)
                                    sch_id = sch_id.replace(' ',
                                                            '').decode('utf-8')

                                    if sch_id not in schedules:
                                        SourceSchedules.objects.create(
                                            source_id=sch_id,
                                            source_obj=source,
                                            film=objt,
                                            cinema=cinema_obj,
                                            dtime=dtime,
                                        )
                                        schedules.append(sch_id)
        if flag:
            break
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
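The crawl window above is one page per day for a week, with the date encoded as dd.mm in the query string. The same loop as a reusable sketch (helper name hypothetical):

import datetime

def daily_schedule_urls(base_url, days=7):
    # Yields (date, url) pairs for today through today + days - 1.
    today = datetime.date.today()
    for offset in range(days):
        date = today + datetime.timedelta(days=offset)
        yield date, '%skino/?rasdel=kino&day=%s#daybox' % (
            base_url, date.strftime('%d.%m'))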
Beispiel #28
0
def get_okinoua_distributors(request):
    form = OkinoUploadForm()
    if request.POST:
        form = OkinoUploadForm(request.POST, request.FILES)
        if form.is_valid():
            source = ImportSources.objects.get(url='http://www.okino.ua/')

            with open('%s/dump_%s_nof_film.xml' %
                      (settings.NOF_DUMP_PATH, source.dump), 'r') as f:
                xml_data = BeautifulSoup(f.read(), from_encoding="utf-8")

            ignored = get_ignored_films()

            films_slugs = [i.get('slug_ru') for i in xml_data.findAll('film')]
            today = datetime.date.today()
            films_dict = get_source_data(source, 'film', 'dict')

            releases = SourceReleases.objects.select_related('film').filter(
                film__source_obj=source, release__gte=today)
            releases_dict = {}
            for i in releases:
                releases_dict[i.film.source_id] = i

            data_nof_films = ''

            data = request.FILES['file'].read()
            html_data = BeautifulSoup(data, from_encoding="utf-8")

            main = html_data.find('div', {'class': 'release_list'})

            year = datetime.date.today().year

            first_h3 = main.findAll('h3', limit=1)[0]
            for div in first_h3.find_next_siblings():
                film_tag = div.find('p', {'class': 'name'})
                flag = False
                if film_tag:
                    flag = True
                    film_tag = film_tag.a
                    film_name = film_tag.string.encode('utf-8')
                    full_url = film_tag.get('href').encode('utf-8')
                    film_id = re.findall(r'\d+\/$', full_url)[0].replace(
                        '/', '').encode('utf-8')
                    film_slug = low(del_separator(film_name))
                    film_year = div.find('span', {
                        'class': 'y'
                    }).string.encode('utf-8').replace('(',
                                                      '').replace(')', '')
                    full_url = 'http://www.okino.ua%s' % full_url

                    release_day = int(
                        div.find('span', {
                            'class': 'day'
                        }).string)
                    release_month = div.find('span', {
                        'class': 'month'
                    }).string.encode('utf-8')
                    release_month = get_month(release_month)

                    release_date = datetime.date(year, int(release_month),
                                                 release_day)

                    film_obj = films_dict.get(film_id)
                    if not film_obj:
                        kid, info = film_identification(film_slug,
                                                        None, {}, {},
                                                        year=film_year,
                                                        source=source)
                        if kid:
                            film_obj = SourceFilms.objects.create(
                                source_id=film_id,
                                source_obj=source,
                                name=film_name,
                                kid=kid,
                                year=film_year,
                            )
                        else:
                            temp_film_slug = film_slug.decode('utf-8')
                            if temp_film_slug not in films_slugs and temp_film_slug not in ignored:
                                films_slugs.append(film_slug.decode('utf-8'))
                                data_nof_films += xml_noffilm(
                                    film_name, film_slug, None, None, film_id,
                                    info, full_url.encode('utf-8'), source.id)
                    # Runs for both known and newly identified films, so
                    # releases of already imported films are updated too.
                    if film_obj:
                        for p in div.findAll('p'):
                            if p.string:
                                text = p.string.encode('utf-8')
                                if 'Дистрибьютор:' in text:
                                    distr = text.replace('Дистрибьютор: ', '').decode('utf-8')

                                    release_obj = releases_dict.get(film_id)
                                    if release_obj:
                                        if release_obj.release != release_date or release_obj.distributor != distr:
                                            release_obj.release = release_date
                                            release_obj.distributor = distr
                                            release_obj.save()
                                    else:
                                        release_obj = SourceReleases.objects.create(
                                            source_obj=source,
                                            film=film_obj,
                                            release=release_date,
                                            distributor=distr,
                                        )
                                        releases_dict[film_id] = release_obj
                if div.string:
                    year = int(
                        re.findall(r'\d+$', div.string.encode('utf-8'))[0])

            xml_data = str(xml_data).replace('<html><head></head><body><data>',
                                             '').replace(
                                                 '</data></body></html>', '')
            xml_data = '<data>%s%s</data>' % (xml_data, data_nof_films)

            create_dump_file('%s_nof_film' % source.dump,
                             settings.NOF_DUMP_PATH, xml_data)
            return HttpResponseRedirect(reverse('admin_source_releases_show'))

    return render_to_response('release_parser/okinoua_upload.html',
                              {'form': form},
                              context_instance=RequestContext(request))
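The release bookkeeping above is a hand-rolled update-or-create against a pre-fetched dict. With a direct queryset it would look roughly like this sketch (same model and field names as above, helper name hypothetical):

def upsert_release(source, film_obj, release_date, distributor):
    # Create the release if missing, otherwise update changed fields.
    obj, created = SourceReleases.objects.get_or_create(
        source_obj=source,
        film=film_obj,
        defaults={'release': release_date, 'distributor': distributor},
    )
    if not created and (obj.release != release_date
                        or obj.distributor != distributor):
        obj.release = release_date
        obj.distributor = distributor
        obj.save()
    return obj

The dict cache in the view trades this per-film lookup query for one upfront fetch.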
Beispiel #29
0
def get_zapad24ru():
    ignored = get_ignored_films()
    ignored_cinemas = get_ignored_cinemas()

    source = ImportSources.objects.get(url='http://zapad24.ru/')
    sfilm_clean(source)

    cities_dict = get_source_data(source, 'city', 'dict')
    cinemas_dict = get_source_data(source, 'cinema', 'dict')
    schedules = get_source_data(source, 'schedule', 'list')

    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    today = datetime.datetime.now()
    next_month = datetime.date.today() + datetime.timedelta(days=40)

    data_nof_films = ''
    data_nof_cinema = ''
    data_nof_city = ''
    noffilms = []

    req = urllib.urlopen('%safisha/' % source.url)
    if req.getcode() == 200:
        data = BeautifulSoup(req.read())  #, from_encoding="utf-8"
        div = data.find('div', align="left")
        for ind, table in enumerate(
                div.findAll('table',
                            border="0",
                            cellpadding="0",
                            cellspacing="0",
                            width="100%")):
            cinema_tag = table.find('strong').string.encode('utf-8')
            cinema_name = re.findall(r'\".+\"',
                                     cinema_tag)[0].replace('"', '').strip()
            cinema_slug = low(del_separator(cinema_name))
            cinema_id = cinema_slug.decode('utf-8')

            city_name = re.findall(r'\(.+\)', cinema_tag)[0].replace(
                '(г. ', '').replace(')', '').strip()
            city_slug = low(del_separator(city_name))
            city_id = city_slug.decode('utf-8')

            city_obj = cities_dict.get(city_id)

            if not city_obj:
                city = City.objects.filter(name__name=city_slug,
                                           name__status=2).distinct('pk')
                if city.count() == 1:
                    city_obj = SourceCities.objects.create(
                        source_id=city_id,
                        source_obj=source,
                        city=city[0],
                        name=city_name,
                    )
                    cities_dict[city_id] = city_obj
                else:
                    if 'slug="%s"' % city_slug not in data_nof_city:
                        data_nof_city += '<city name="%s" slug="%s"></city>' % (
                            city_name, city_slug)

            if city_obj:
                cinema_ig_id = u'%s__%s' % (cinema_slug.decode('utf-8'),
                                            city_obj.city.kid)

                if cinema_ig_id not in ignored_cinemas:
                    cinema_obj = cinemas_dict.get(cinema_id)
                    if not cinema_obj:
                        filter1 = {
                            'name__name': cinema_slug,
                            'name__status': 2,
                            'city': city_obj.city
                        }
                        cinema_kid = cinema_identification(
                            cinema_slug, filter1)
                        if cinema_kid:
                            try:
                                cinema = Cinema.objects.get(code=cinema_kid)
                                cinema_obj = SourceCinemas.objects.create(
                                    source_id=cinema_id,
                                    source_obj=source,
                                    city=city_obj,
                                    cinema=cinema,
                                    name=cinema_name,
                                )
                                cinemas_dict[cinema_id] = cinema_obj
                            except Cinema.DoesNotExist:
                                pass
                        else:
                            if 'slug="%s"' % cinema_slug not in data_nof_cinema:
                                data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (
                                    cinema_name, cinema_slug, city_name,
                                    city_obj.city.kid)

                    if cinema_obj:
                        film_table = table.find('table')
                        date_from = None
                        date_to = None
                        for tr in film_table.findAll('tr'):
                            film_name, film_slug, film_id = (None, None, None)
                            if ind == 0:
                                film_name = tr.find('b').string.encode(
                                    'utf-8').strip()
                                film_slug = low(del_separator(film_name))
                                film_id = film_slug.decode('utf-8')
                            else:
                                showdate = ''
                                for f in tr.findAll('b'):
                                    if f.find('span'):
                                        showdate = f.find(
                                            'span').string.encode(
                                                'utf-8').strip()
                                    else:
                                        film_name = f.string.encode(
                                            'utf-8').strip()
                                        film_name = re.findall(
                                            r'\«.+\»', film_name)[0]
                                        film_name = film_name.replace(
                                            '«', '').replace('»', '').strip()
                                        film_slug = low(
                                            del_separator(film_name))
                                        film_id = film_slug.decode('utf-8')

                                if showdate and film_name:
                                    try:
                                        # "dd.mm-dd.mm" ranges
                                        date_from, date_to = showdate.split('-')
                                        date_from_day, date_from_month = date_from.strip().split('.')
                                        date_to_day, date_to_month = date_to.strip().split('.')
                                    except ValueError:
                                        # "d <month name> – d <month name>" ranges,
                                        # month names resolved via get_month
                                        date_from, date_to = showdate.split(' – ')
                                        date_from_day, date_from_month = date_from.strip().split()
                                        date_from_month = get_month(date_from_month)
                                        date_to_day, date_to_month = date_to.strip().split()
                                        date_to_month = get_month(date_to_month)

                                    date_from = datetime.date(
                                        today.year, int(date_from_month),
                                        int(date_from_day))
                                    date_to = datetime.date(
                                        today.year, int(date_to_month),
                                        int(date_to_day))

                            full_url = tr.find('a').get('href').encode('utf-8')

                            if film_id not in noffilms and film_id not in ignored:
                                obj = films.get(film_id)
                                next_step = checking_obj(obj)

                                if next_step:
                                    if obj:
                                        kid = obj.kid
                                    else:
                                        kid, info = film_identification(
                                            film_slug,
                                            None, {}, {},
                                            source=source)

                                    objt = None
                                    if kid:
                                        create_new, objt = unique_func(
                                            fdict, kid, obj)
                                        if create_new:
                                            objt = create_sfilm(
                                                film_id, kid, source,
                                                film_name)
                                            films[film_id] = objt
                                            if not fdict.get(kid):
                                                fdict[kid] = {
                                                    'editor_rel': [],
                                                    'script_rel': []
                                                }
                                            fdict[kid]['script_rel'].append(
                                                objt)
                                    elif not obj:
                                        data_nof_films += xml_noffilm(
                                            film_name, film_slug, None, None,
                                            film_id.encode('utf-8'), info,
                                            full_url, source.id)
                                        noffilms.append(film_id)

                                    if objt:
                                        req_film = urllib.urlopen(full_url)
                                        if req_film.getcode() == 200:
                                            data_film = BeautifulSoup(req_film.read())  #, from_encoding="utf-8"

                                            td = data_film.find('td', {'class': 'news'}).div.text.encode('utf-8')

                                            showtime = []

                                            if ind == 0:
                                                showtime = re.findall(r'\d+\:\d+\s\s?', td)
                                            else:
                                                if date_from and date_to:
                                                    if date_to < next_month:
                                                        showtimes = re.findall(r'Начало сеансов:\s?[\d+\-\d+\,?\s?]+', td)
                                                        times = []
                                                        for t in showtimes:
                                                            t = t.replace('Начало сеансов:', '').split(',')
                                                            times = [i.strip() for i in t if i.strip()]

                                                        delta = date_to - date_from
                                                        for day in range(delta.days + 1):
                                                            d = date_from + datetime.timedelta(days=day)
                                                            for t in times:
                                                                hours, minutes = t.split('-')
                                                                dtime = datetime.datetime(d.year, d.month, d.day, int(hours), int(minutes))
                                                                showtime.append(dtime)

                                            for t in showtime:
                                                if ind == 0:
                                                    hours, minutes = t.strip().split(':')
                                                    dtime = datetime.datetime(today.year, today.month, today.day, int(hours), int(minutes))
                                                else:
                                                    dtime = t

                                                sch_id = '%s%s%s%s' % (dtime, cinema_slug, city_slug, film_id.encode('utf-8'))
                                                sch_id = sch_id.replace(' ', '').decode('utf-8')

                                                if sch_id not in schedules:
                                                    SourceSchedules.objects.create(
                                                        source_id=sch_id,
                                                        source_obj=source,
                                                        film=objt,
                                                        cinema=cinema_obj,
                                                        dtime=dtime,
                                                    )
                                                    schedules.append(sch_id)

    create_dump_file('%s_nof_city' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_city)
    create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_cinema)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_films)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
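The film-page parsing above pulls "Начало сеансов: 10-30, 14-00" blocks out of the page text and expands them across the announced date range. Extracted as a sketch that collects times from every matching block (the inline loop keeps only the last match's times):

import re
import datetime

def parse_session_starts(td_text, on_date):
    # td_text is the utf-8 page text; returns datetimes on on_date.
    result = []
    for chunk in re.findall(r'Начало сеансов:\s?[\d+\-\d+\,?\s?]+', td_text):
        chunk = chunk.replace('Начало сеансов:', '')
        for part in chunk.split(','):
            part = part.strip()
            if part:
                hours, minutes = part.split('-')
                result.append(datetime.datetime(
                    on_date.year, on_date.month, on_date.day,
                    int(hours), int(minutes)))
    return result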
Beispiel #30
0
def get_cinema5_schedules():
    data_nof_cinema = ''
    data_nof_film = ''
    noffilms = []

    ignored = get_ignored_films()

    source = ImportSources.objects.get(url='http://cinema5.ru/')
    sfilm_clean(source)

    films = {}
    source_films = SourceFilms.objects.filter(source_obj=source)
    for i in source_films:
        films[i.source_id] = i
    fdict = get_all_source_films(source, source_films)

    schedules = get_source_data(source, 'schedule', 'list')

    data = [
        {
            'city': 'Нижнекамск',
            'url': '%snk' % source.url
        },
        {
            'city': 'Оренбург',
            'url': '%soren' % source.url
        },
        {
            'city': 'Саратов',
            'url': '%ssaratov' % source.url
        },
        {
            'city': 'Уфа',
            'url': '%sufa' % source.url
        },
        {
            'city': 'Чебоксары',
            'url': '%scheby' % source.url
        },
    ]

    params = ['today', 'tomorrow', '+2days']

    cinema_name = 'Синема 5'
    cinema_slug = low(del_separator(cinema_name))

    for i in data:
        city_slug = low(del_separator(i['city']))
        city = City.objects.get(name__name=i['city'], name__status=1)

        city_obj, city_created = SourceCities.objects.get_or_create(
            source_id=city_slug,
            source_obj=source,
            defaults={
                'source_id': city_slug,
                'source_obj': source,
                'city': city,
                'name': i['city'],
            })

        cinema = None

        try:
            cinema = Cinema.objects.get(name__name=cinema_name,
                                        name__status=1,
                                        city=city)
        except Cinema.DoesNotExist:
            data_nof_cinema += '<cinema name="%s" slug="%s" city="%s" city_kid="%s"></cinema>' % (
                cinema_name, cinema_slug, i['city'], city_obj.city.kid)

        if cinema:
            cinema_id = '%s_%s' % (cinema_slug, city_slug)

            cinema_obj, cinema_created = SourceCinemas.objects.get_or_create(
                source_id=cinema_id,
                source_obj=source,
                defaults={
                    'source_id': cinema_id,
                    'source_obj': source,
                    'city': city_obj,
                    'cinema': cinema,
                    'name': cinema_name,
                })

            for param in params:
                url = '%s?date=%s' % (i['url'], param)
                req = urllib.urlopen(url)
                if req.getcode() == 200:
                    page_data = BeautifulSoup(req.read())
                    divs = page_data.find('div', {'class': 'content clearfix'})

                    showdate = divs.find('h1')
                    if showdate:
                        showdate = showdate.string.encode('utf-8')
                        day, month, year = showdate.replace(
                            'Расписание на ', '').strip().split('.')

                        for div in divs.findAll('div',
                                                {'class': 'show-wrapper'}):
                            film_name = div.find('div', {
                                'class': 'title'
                            }).string.encode('utf-8')
                            film_slug = low(
                                del_separator(del_screen_type(film_name)))
                            film_id = film_slug

                            if film_id not in noffilms and film_slug.decode(
                                    'utf-8') not in ignored:

                                obj = films.get(film_id.decode('utf-8'))
                                next_step = checking_obj(obj)

                                if next_step:
                                    if obj:
                                        kid = obj.kid
                                    else:
                                        kid, info = film_identification(
                                            film_slug,
                                            None, {}, {},
                                            source=source)

                                    objt = None
                                    if kid:
                                        create_new, objt = unique_func(
                                            fdict, kid, obj)
                                        if create_new:
                                            objt = create_sfilm(
                                                film_id, kid, source,
                                                film_name)
                                            films[film_id.decode(
                                                'utf-8')] = objt
                                            if not fdict.get(kid):
                                                fdict[kid] = {
                                                    'editor_rel': [],
                                                    'script_rel': []
                                                }
                                            fdict[kid]['script_rel'].append(
                                                objt)
                                    elif not obj:
                                        data_nof_film += xml_noffilm(
                                            film_name, film_slug, None, None,
                                            film_id, info, None, source.id)
                                        noffilms.append(film_id)

                                    if objt:
                                        for span in div.findAll(
                                                'span', {'class': 'time'}):
                                            hours, minutes = span.string.strip().split(':')
                                            dtime = datetime.datetime(
                                                int(year), int(month),
                                                int(day), int(hours),
                                                int(minutes))

                                            sch_id = '%s%s%s%s' % (
                                                dtime, cinema_id, city_slug,
                                                film_id)
                                            sch_id = sch_id.replace(
                                                ' ', '').decode('utf-8')

                                            if sch_id not in schedules:
                                                SourceSchedules.objects.create(
                                                    source_id=sch_id,
                                                    source_obj=source,
                                                    film=objt,
                                                    cinema=cinema_obj,
                                                    dtime=dtime,
                                                )
                                                schedules.append(sch_id)

    create_dump_file('%s_nof_cinema' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_cinema)
    create_dump_file('%s_nof_film' % source.dump, settings.NOF_DUMP_PATH,
                     '<data>%s</data>' % data_nof_film)
    cron_success('html', source.dump, 'schedules', 'Сеансы')
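SourceCities and SourceCinemas rows are created with the same get_or_create idiom in several parsers above, each time repeating source_id and source_obj inside defaults even though they are already lookup keys. A compact sketch of the idiom without that duplication (helper name hypothetical):

def ensure_source_city(source, city_slug, city, city_name):
    # Lookup keys identify the row; defaults only fill fields on creation.
    obj, _ = SourceCities.objects.get_or_create(
        source_id=city_slug,
        source_obj=source,
        defaults={'city': city, 'name': city_name},
    )
    return obj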