def load(self):
    url_from_name = form_url_from_name(self.film.name_orig)
    if url_from_name is not None:
        reqobj = simple_tor_get_page(url_from_name)
        html_with_type = HTML_with_type(reqobj.decode('utf-8'))
        len(html_with_type)
        html_with_type.page_type = 'film_page'
        return {'html': html_with_type, 'url_view': url_from_name, 'url': url_from_name}
    else:
        html_with_type = HTML_with_type(simple_tor_get_page(form_search_url(self.film.name)))
        html_with_type.page_type = 'search_page'
        return {'html': html_with_type, 'url_view': url_from_name, 'url': url_from_name}
def get_film_data(self):
    search_film_url = '/hbo/api/v1/films.json?'
    site_name = 'www.amediateka.ru'
    filter_film_search = 'limit=1000&offset=0&expand=genres&client_id=amediateka&platform=desktop'
    url = "http://{}{}{}".format(site_name, search_film_url, filter_film_search)
    response = simple_tor_get_page(url)
    data_site = json.loads(response)['films']
    film = Films.objects.values_list('id', 'name')
    data = film.values('name', 'id')
    locations = {
        'info': [],
        'type': 'amediateka_ru'
    }
    for f in data_site:
        for film in data:
            if f['name'] == film['name']:
                film_data = Films.objects.filter(id=film['id'])
                for dict_film in film_data:
                    d = self.film_dict(dict_film, f)
                    one_loc_res = save_location(**d)
                    save_existed_location_to_locs_dict(locations, one_loc_res)
                break
    fill_log_table_for_not_schema_corresponded_robots(locations)
    robot_is_banned = MultiLocationRobotsBunCheck.is_result_looks_like_robot_banned(locations)
    if not robot_is_banned:
        LocationRobotsCorrector.correct_locations(locations, 'amediateka')
    return locations
def update_kinopoisk_persone(pid):
    try:
        response = simple_tor_get_page('http://www.kinopoisk.ru/name/{}/view_info/ok/#trivia'.format(pid), True)
        soup = BeautifulSoup(response)
        tag = soup.find('span', attrs={'itemprop': 'alternativeHeadline'})
        orig_name = tag.text.strip()
        p = Persons.objects.get(kinopoisk_id=pid)
        tag_birthdate = soup.find('td', attrs={'class': 'birth'})
        birthdate = ''
        print "ID = ", p.id
        if tag_birthdate is not None:
            birthdate = tag_birthdate.get('birthdate')
        else:
            print 'No birthdate data for this person id = {}'.format(pid)
        tags_bio = soup.findAll('li', attrs={'class': 'trivia'})
        bio = ''
        if len(tags_bio):
            for li in tags_bio:
                bio = bio + ' ' + li.text
        else:
            print 'No biography for this person id = {}'.format(pid)
        p.name_orig = orig_name
        p.birthdate = birthdate
        p.bio = bio
        p.kinopoisk_id = pid
        if p.photo == '' and p.kinopoisk_id != 0:
            p.photo.save('profile.jpg', File(get_photo(p.kinopoisk_id)))
        p.save()
    except Exception, e:
        import traceback
        traceback.print_exc()
def parse(self, response, dict_gen, film, url):
    d = dict_gen(film)
    content = simple_tor_get_page(url)
    value = ''
    isFilm = False
    try:
        soup = BeautifulSoup(content)
        tag = soup.find('meta', attrs={'property': 'og:type'})
        if (tag is not None and film.type == APP_FILM_FULL_FILM) \
                or (film.type == APP_FILM_SERIAL and tag is None):
            tag = soup.find('div', attrs={'class': 'big_rating'})
            tag_id = tag.get('id')
            value = self.parse_value + tag_id
            isFilm = True
    except:
        pass
    if isFilm:
        d['url_view'] = url
        d['value'] = value
        d['price_type'] = 0
        d['price'] = self.get_price()
        d['type'] = 'zoomby'
        return [d]
    return []
def get_film_data(self):
    url = 'http://my.mail.ru/video/catalog/movies'
    content = simple_tor_get_page(url)
    soup = BeautifulSoup(content)
    locations = {
        'info': [],
        'type': 'mail_ru'
    }
    items = soup.findAll('a', {'class': 'link-default'})
    for item in items:
        film_name = item.text
        film_link = item.get('href')
        film_url = HOST + film_link
        film_dict = self.get_film_dict(film_name)
        if film_dict is not None:
            film_dict['type'] = 'mail_ru'
            film_dict['url_view'] = film_url
            film_dict['price'] = 0
            film_dict['price_type'] = APP_CONTENTS_PRICE_TYPE_FREE
            one_loc_res = save_location(**film_dict)
            save_existed_location_to_locs_dict(locations, one_loc_res)
    fill_log_table_for_not_schema_corresponded_robots(locations)
    return locations
def parse_search(response, film_name, year):
    film_link = None
    try:
        soup = BeautifulSoup(response)
        if soup.find('div', {'class': 'empty-search'}) is not None:
            return None
        search_div = soup.find('div', {'class': 'card-list'})
        film_divs = search_div.find_all(
            'div', {'class': ['card', 'no-rationale', 'tall-cover', 'movies tiny']})
        for film in film_divs:
            film_tag = film.find('a', {'class': 'title'})
            if film_name == film_tag.get('title'):
                film_link = 'http://play.google.com' + film_tag.get('href')
        if film_link:
            page = simple_tor_get_page(film_link)
            soup_page = BeautifulSoup(page)
            film_year = soup_page.find('div', {'itemprop': 'datePublished'}).text
            film_year = re.search(ur'\d+', film_year)
            if str(year) not in film_year.group():
                return None
    except IndexError:
        film_link = None
    return film_link
def get_data(self):
    all_film_url = 'http://viaplay.ru/filmy/vse/5/alphabetical'
    locations = {
        'info': [],
        'type': 'viaplay'
    }
    content = simple_tor_get_page(all_film_url)
    soup_films = BeautifulSoup(content).find('ul', {'class': 'atoz-list'}).li.ul.find_all('li')
    films = Films.objects.values('name', 'id')
    for li_film in soup_films:
        for film in films:
            site_name = li_film.a.text.lower().strip().encode('utf-8').translate(None, string.punctuation)
            db_name = film['name'].lower().strip().encode('utf-8').translate(None, string.punctuation)
            if site_name == db_name:
                link = 'http://viaplay.ru' + li_film.a.get('href')
                film_query_set = Films.objects.filter(id=film['id'])
                for obj in film_query_set:
                    d = self.film_dict(obj, link)
                    one_loc_res = save_location(**d)
                    save_existed_location_to_locs_dict(locations, one_loc_res)
                break
    fill_log_table_for_not_schema_corresponded_robots(locations)
    robot_is_banned = MultiLocationRobotsBunCheck.is_result_looks_like_robot_banned(locations)
    if not robot_is_banned:
        LocationRobotsCorrector.correct_locations(locations, 'viaplay')
    return locations
def get_soup():
    '''
    Get data from the playfamily site and parse its XML into a BeautifulSoup object.
    '''
    xmldata = simple_tor_get_page(PLAYFAMILY_XML)
    soup = BeautifulSoup(xmldata, 'xml')
    return soup
def get_image(template, actor_id):
    try:
        r = simple_tor_get_page(template.format(actor_id), tor_flag=False)
        return convert_file(r)
    except Exception as e:
        import traceback
        traceback.print_exc()
        return None
def parse_translation_live_russia_tv():
    translation_list = []
    # Get current date
    current_date = timezone.now()
    # Get page
    translation_page = simple_tor_get_page(TRANSLATION_URL)
    soup = BeautifulSoup(translation_page)
    translation_bloc = soup.find('div', {'class': ['broadcasts', 'tab-non-active', 'tab-broadcasts']})
    for trans in translation_bloc.find_all('li'):
        try:
            # Get date and time
            time_tag = trans.find('div', {'class': 'time'})
            time = re.findall(ur'\d+', time_tag.text)
            date_tag = trans.find('div', {'class': 'label'})
            date_str = date_tag.text.lower().strip().split()
            if len(date_str) == 1:
                if date_str[0] == u'сегодня':
                    date = timezone.datetime(year=current_date.year, month=current_date.month,
                                             day=current_date.day, hour=int(time[0]), minute=int(time[1]))
            elif len(date_str) == 3:
                date = timezone.datetime(year=int(date_str[2]), month=MONTHS[date_str[1]],
                                         day=int(date_str[0]), hour=int(time[0]), minute=int(time[1]))
            else:
                continue
            # Get title
            title_tag = trans.h2
            title = title_tag.text
            # Get link
            href = trans.a.get('href')
            link = TRANSLATION_URL + href
            # Get video id
            video_id = href.split('/')[-1]
            # Create dict with information about the translation
            translation_data = {
                'title': title,
                'date': date,
                'price': float(0),
                'link': link,
                'meta': {},
                'embed_code': None,
                'value': video_id,
                'player': PLAYER_LINK % video_id
            }
            translation_list.append(translation_data)
        except Exception, e:
            print e.message
def __init__(self, film_id):
    self.film = Films.objects.get(id=film_id)
    search_film = urllib.urlencode({'text': self.film.name.encode('utf-8')})
    search_url = URL_SEARCH.format(search_film)
    url = "https://%s/%s" % (HOST, search_url)
    self.response = simple_tor_get_page(url)
def kinopoisk_news():
    data = BeautifulSoup(
        simple_tor_get_page(KINOPOISK_PREMIERES_URL).content.decode('cp1251'))
    big_names = data.select('span.name_big')
    names = data.select('span.name')
    for name in big_names + names:
        if name.a:
            kinopoisk_id = int(name.a.attrs['href'].split('/')[2])
            kinopoisk_name = name.a.text
            yield kinopoisk_name, kinopoisk_id
def parse_search(response, film_name, film_year):
    film_link = None
    try:
        flag = False
        soup = BeautifulSoup(response)
        if soup.find(attrs={'data-href': 'video_search'}) is None:
            return None
        class_tag = soup.find('aside', {'role': 'complementary'})
        if class_tag:
            li_list = class_tag.find_all('li')
            for li in li_list:
                if u'Видео' in li.text:
                    flag = True
                    break
        if flag:
            search_divs = soup.find_all('div', {'class': 'catalog-list search'})
            film_div = None
            for div in search_divs:
                if div.figure:
                    film_div = div
                    break
            if film_div:
                films = film_div.find_all('figure')
                for film in films:
                    search_name = film_name.lower().strip().encode('utf-8').translate(None, string.punctuation)
                    found_name = film.figcaption.div.header.strong.text.lower().strip().encode('utf-8').translate(None, string.punctuation)
                    if search_name == found_name:
                        film_link = 'http://www.zabava.ru' + film.a.get('href')
                        break
            else:
                return None
        else:
            return None
        if film_link:
            film_page = simple_tor_get_page(film_link)
            film_soup = BeautifulSoup(film_page)
            year_bloc = film_soup.find('div', {'class': 'mbottom10'})
            reg = re.compile(ur'Год издания')
            year_tag = None
            for e in year_bloc.find_all('em'):
                if reg.match(e.text):
                    year_tag = e.parent
                    break
            if year_tag:
                year = re.search(ur'\d+', year_tag.text)
                year = int(year.group())
                if film_year.year != year:
                    film_link = None
            else:
                return None
    except:
        film_link = None
    return film_link
def get_vote(soup):
    csslink = [lnk.attrs['href'] for lnk in soup.find_all('link')
               if 'votes' in lnk.attrs['href']][0]
    # TODO implement caching
    css = simple_tor_get_page(csslink, tor_flag=True)
    m = re.search('[.]starbar[ ]{width[:][ ](?P<width>[0-9]+)px', css)
    parent_width = float(m.groupdict()['width'])
    starbar_div = soup.select('div.starbar_w')
    child_width = float(dict([i.split(':') for i in starbar_div[0].attrs['style'].split(';')])['width'].replace('px', ''))
    return round(child_width / (parent_width * 10), 2)
def get_data(self):
    all_film_url = 'http://viaplay.ru/filmy/vse/5/alphabetical'
    content = simple_tor_get_page(all_film_url)
    soup_films = BeautifulSoup(content).find('ul', {'class': 'atoz-list'}).li.ul.find_all('li')
    films = Films.objects.values('name', 'id')
    for li_film in soup_films:
        for film in films:
            if li_film.a.text == film['name']:
                link = 'http://viaplay.ru' + li_film.a.get('href')
                film_query_set = Films.objects.filter(id=film['id'])
                for obj in film_query_set:
                    d = self.film_dict(obj, link)
                    save_location(**d)
                break
def parse_news(robot_name):
    # Get site
    site = SITE_DICT[robot_name]
    # Get page
    page = simple_tor_get_page(site['news_url'])
    soup = BeautifulSoup(page)
    # Constant for punctuation
    punctuation = string.punctuation
    trans = string.maketrans(punctuation, ' ' * len(punctuation))
    # String constant
    pattern = '!"#$%&\'()*+,-./:;<=>–№«»?@[\\]^_`{|}~ '  # ascii: 160
    cursor = connection.cursor()
    # Get films from page
    films = soup.find_all(*site['films_tag_args'])
    news = []
    for film in films:
        try:
            # Find name from tag
            name_tag = film.find(*site['name_tag_args'])
            name = name_tag.text.lower().strip().encode('utf-8').translate(trans)
            # Find year from tag
            year_tag = film.find(*site['year_tag_args'])
            year = int(re.search(ur'\d+', year_tag.text).group())
            query = """SELECT *
                       FROM (SELECT films.id, films.name,
                                    EXTRACT(YEAR FROM films.release_date) AS year,
                                    regexp_split_to_array(trim(both lower(translate(films.name, E%s, %s))), E'\\s+') AS new_name
                             FROM films) AS t
                       WHERE t.year=%s AND t.new_name=regexp_split_to_array(%s, E'\\s+')"""
            cursor.execute(query, [pattern, ' ' * len(pattern), year, name])
            result = dict_fetch_all_without_gen(cursor)
            if len(result) == 1:
                link_tag = name_tag if robot_name == ROBOT_STREAM else name_tag.a
                link = site['host'] + link_tag.get('href')
                news.append({'film_id': result[0]['id'], 'url': link})
        except Exception, e:
            print e.message
def get_film_data(self):
    search_film_url = '/hbo/api/v1/films.json?'
    filter_film_search = 'limit=1000&offset=0&expand=genres&client_id=amediateka&platform=desktop'
    url = "http://{}{}{}".format('www.amediateka.ru', search_film_url, filter_film_search)
    response = simple_tor_get_page(url)
    data_site = json.loads(response)['films']
    film = Films.objects.values_list('id', 'name')
    data = film.values('name', 'id')
    for f in data_site:
        for film in data:
            if f['name'] == film['name']:
                film_data = Films.objects.filter(id=film['id'])
                for dict_film in film_data:
                    d = self.film_dict(dict_film, f)
                    save_location(**d)
                break
def get_serials_data(self):
    search_serials_url = '/hbo/api/v1/serials.json?'
    filter_serials_search = 'limit=1000&offset=0&expand=seasons,genres&client_id=amediateka&platform=desktop'
    url = "http://{}{}{}".format('www.amediateka.ru', search_serials_url, filter_serials_search)
    response = simple_tor_get_page(url)
    data_site = json.loads(response)['serials']
    data = Films.objects.values_list('id', 'name').values('name', 'id')
    for s in data_site:
        for serials in data:
            if s['name'] == serials['name']:
                serials_data = Films.objects.filter(id=serials['id'])
                for dict_serials in serials_data:
                    if dict_serials.type == APP_FILM_SERIAL:
                        list_serial = self.serial_dict(dict_serials, s)
                        for ser in list_serial:
                            save_location(**ser)
                break
def parse_translation_championat_com():
    translation_list = []
    champ_dict = {}
    # Get page
    list_translations_page = simple_tor_get_page(TRANSLATION_URL + '/broadcast/')
    list_translations_soup = BeautifulSoup(list_translations_page)
    championship_bloc = list_translations_soup.find('div', {'class': 'broadcast__menu'})
    # Create championship map
    for champ in championship_bloc.find_all('div', {'class': 'broadcast__menu__i'}):
        try:
            img = champ.find('div', {'class': 'broadcast__tournament'})
            if img:
                champ_dict[img.get('class')[-1]] = champ.p.text
        except Exception, e:
            print e.message
def acquire_page(page_id):
    if not os.path.exists(PAGE_ARCHIVE):
        os.mkdir(PAGE_ARCHIVE)
    dump_path = os.path.join(PAGE_ARCHIVE, str(page_id))
    page_dump = ''
    if os.path.exists(dump_path):
        with open(dump_path) as fd:
            page_dump = fd.read().decode('utf-8')
    if not page_dump:
        url = u"http://www.kinopoisk.ru/film/%d/cast/" % page_id
        res = simple_tor_get_page(url, tor_flag=True)
        page_dump = res.decode('cp1251')
        with open(dump_path, 'w') as fdw:
            fdw.write(page_dump.encode('utf-8'))
    return page_dump
def get_vote(soup):
    csslink = [
        lnk.attrs['href'] for lnk in soup.find_all('link')
        if 'votes' in lnk.attrs['href']
    ][0]
    # TODO implement caching
    css = simple_tor_get_page(csslink, tor_flag=True)
    m = re.search('[.]starbar[ ]{width[:][ ](?P<width>[0-9]+)px', css)
    parent_width = float(m.groupdict()['width'])
    starbar_div = soup.select('div.starbar_w')
    child_width = float(
        dict([i.split(':') for i in starbar_div[0].attrs['style'].split(';')])['width'].replace('px', ''))
    return round(child_width / parent_width * 10, 2)
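# A minimal, self-contained sketch of the star-bar arithmetic in get_vote above,
# following the child_width / parent_width * 10 variant. The pixel widths are
# made-up example values, not data scraped from the site: the CSS rule supplies
# the full bar width, the inline style supplies the filled portion.
parent_width = 250.0  # hypothetical '.starbar {width: 250px}' rule from the votes stylesheet
child_width = 212.5   # hypothetical 'width:212.5px' style on the div.starbar_w element
vote = round(child_width / parent_width * 10, 2)  # filled fraction scaled to a 10-point rating
print(vote)  # 8.5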
def acquire_page(page_id, force_reload=False):
    if not os.path.exists(PAGE_ARCHIVE):
        os.mkdir(PAGE_ARCHIVE)
    page_dump = ''
    dump_path = os.path.join(PAGE_ARCHIVE, str(page_id))
    if os.path.exists(dump_path):
        with open(dump_path) as fd:
            page_dump = fd.read().decode('utf-8')
    if not page_dump or force_reload:
        url = u"http://www.kinopoisk.ru/film/%d/cast/" % page_id
        res = simple_tor_get_page(url, tor_flag=True)
        page_dump = res.decode('cp1251')
        with open(dump_path, 'w') as fdw:
            fdw.write(page_dump.encode('utf-8'))
    return page_dump
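# Hypothetical usage sketch for acquire_page above. The numeric page id is a
# made-up example value, and PAGE_ARCHIVE is assumed to point at a writable
# directory configured elsewhere in the project.
cast_html = acquire_page(12345)                     # served from the local dump when one exists
cast_html = acquire_page(12345, force_reload=True)  # always re-fetched via Tor and re-cached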
def get_data(self):
    try:
        films = json.loads(self.response)['live_search']['search_movies_result']
        for film in films:
            if film['rus_title'] == self.film.name:
                film_link = 'https://www.ayyo.ru/movies/%s/' % film['slug']
                ayyo_film_id = film['movie']
                break
        film_url = 'https://www.ayyo.ru/api/movies/?{}'.format(
            urllib.urlencode({'id__in': ayyo_film_id}))
        film_response = simple_tor_get_page(film_url)
        price = float(
            json.loads(film_response)['movies']['data'][str(ayyo_film_id)]['streaming_price'])
        d = self.film_dict(self.film, film_link, price)
        save_location(**d)
    except Exception, e:
        pass
def get_data(self):
    locations = {
        'info': [],
        'type': 'www.ayyo.ru'
    }
    try:
        films = json.loads(self.response)['live_search']['search_movies_result']
        print films
        for film in films:
            site_name = film['rus_title'].lower().strip().encode('utf-8').translate(None, string.punctuation)
            db_name = self.film.name.lower().strip().encode('utf-8').translate(None, string.punctuation)
            if site_name == db_name:
                film_link = 'https://www.ayyo.ru/movies/%s/' % film['slug']
                ayyo_film_id = film['movie']
                break
        film_url = 'https://www.ayyo.ru/api/movies/?{}'.format(urllib.urlencode({'id__in': ayyo_film_id}))
        film_response = simple_tor_get_page(film_url)
        price = float(json.loads(film_response)['movies']['data'][str(ayyo_film_id)]['streaming_price'])
        d = self.film_dict(self.film, film_link, price)
        one_loc_res = save_location(**d)
        save_existed_location_to_locs_dict(locations, one_loc_res)
    except Exception, e:
        pass
def kinopoisk_films(pages):
    try:
        for page in range(1, pages + 1):
            print u"Page number: {0} of {1}".format(page, pages)
            html = simple_tor_get_page(KINOPOISK_LIST_FILMS_URL.format(page), tor_flag=True)
            soup = BeautifulSoup(html)
            films_list = soup.findAll('div', attrs={'class': 'name'})
            for film in films_list:
                name = film.a.text
                print u"Film name: {0}".format(name)
                kinopoisk_id = int(film.a.get('href').split('/')[4])
                if u'(сериал)' in name:
                    name = name.replace(u'(сериал)', u'')
                film, flag = Films.objects.get_or_create(
                    kinopoisk_id=kinopoisk_id, defaults={'type': '', 'name': name})
                print u"Film: {0} {1}".format(film.name, film.kinopoisk_id)
                kinopoisk_parse_one_film.apply_async((film.kinopoisk_id, film.name))
                persons_films_update_with_indexes.apply_async((film.kinopoisk_id,))
    except Exception, e:
        import traceback
        traceback.print_exc()
def update_kinopoisk_persone(pid):
    try:
        response = simple_tor_get_page('http://www.kinopoisk.ru/name/{}/view_info/ok/#trivia'.format(pid), True)
        soup = BeautifulSoup(response)
        tag = soup.find('span', attrs={'itemprop': 'alternativeHeadline'})
        orig_name = tag.text.strip()
        p = Persons.objects.get(kinopoisk_id=pid)
        tag_birthdate = soup.find('td', attrs={'class': 'birth'})
        birthdate = ''
        print "ID = ", p.id
        if tag_birthdate is not None:
            birthdate = tag_birthdate.get('birthdate')
        else:
            print 'No birthdate data for this person id = {}'.format(pid)
        bio = ''
        tags_bio = soup.findAll('li', attrs={'class': 'trivia'})
        if len(tags_bio):
            for li in tags_bio:
                bio = bio + ' ' + li.text
        else:
            print 'No biography for this person id = {}'.format(pid)
        p.bio = bio
        p.kinopoisk_id = pid
        p.name_orig = orig_name
        p.birthdate = birthdate
        if p.photo == '' and p.kinopoisk_id != 0:
            p.photo.save('profile.jpg', File(get_photo(p.kinopoisk_id)))
        p.save()
    except Exception, e:
        traceback_own(e)
def parse_tvzor_news():
    # Get page
    page = simple_tor_get_page(NEW_URL)
    # Constant for punctuation
    punctuation = string.punctuation
    trans = string.maketrans(punctuation, ' ' * len(punctuation))
    # String constant
    pattern = '!"#$%&\'()*+,-./:;<=>–№«»?@[\\]^_`{|}~ '  # ascii: 160
    cursor = connection.cursor()
    # Get films from page in json format
    films = json.loads(page)
    news = []
    for film in films:
        try:
            # Film name
            name = film['name'].lower().strip().encode('utf-8').translate(trans)
            # Film year
            year = int(film['releaseDate'])
            query = """SELECT *
                       FROM (SELECT films.name, films.id,
                                    EXTRACT(YEAR FROM films.release_date) AS year,
                                    regexp_split_to_array(trim(both lower(translate(films.name, E%s, %s))), E'\\s+') AS new_name
                             FROM films) AS t
                       WHERE t.year=%s AND t.new_name=regexp_split_to_array(%s, E'\\s+')"""
            cursor.execute(query, [pattern, ' ' * len(pattern), year, name])
            result = dict_fetch_all(cursor)
            if len(result) == 1:
                link = HOST + '/movie/' + film['assetId']
                news.append({'film_id': result[0]['id'], 'url': link})
        except Exception, e:
            print e.message
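# A minimal sketch of the name normalisation that the SQL in parse_news and
# parse_tvzor_news mirrors on the Python side: punctuation is translated to
# spaces, the result is lower-cased and compared as a whitespace-split word
# list. The film title below is a made-up example.
import string

punctuation = string.punctuation
trans = string.maketrans(punctuation, ' ' * len(punctuation))
name = 'Mad Max: Fury Road'.lower().strip().translate(trans)
print(name.split())  # ['mad', 'max', 'fury', 'road']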
def kinopoisk_films(pages):
    try:
        for page in range(1, pages + 1):
            print u"Page number: {0} of {1}".format(page, pages)
            html = simple_tor_get_page(KINOPOISK_LIST_FILMS_URL.format(page), tor_flag=True)
            soup = BeautifulSoup(html)
            films_list = soup.findAll('div', attrs={'class': 'name'})
            for film in films_list:
                kinopoisk_id = int(film.a.get('href').split('/')[4])
                name = film.a.text
                print u"Film name: {0}".format(name)
                if u'(сериал)' in name:
                    name = name.replace(u'(сериал)', u'')
                film, flag = Films.objects.get_or_create(
                    kinopoisk_id=kinopoisk_id, defaults={'type': '', 'name': name})
                print u"Film: {0} {1}".format(film.name, film.kinopoisk_id)
                kinopoisk_parse_one_film.apply_async((film.kinopoisk_id, film.name))
                persons_films_update_with_indexes.apply_async((film.kinopoisk_id,))
    except Exception, e:
        traceback_own(e)
def get_data_dict(kinopoisk_id):
    url = 'http://www.kinopoisk.ru/handler_trailer_popup.php?ids={id}'.format(id=kinopoisk_id)
    content = simple_tor_get_page(url)
    return json.loads(content)
def get_data_dict(kinopoisk_id):
    content = simple_tor_get_page(
        'http://www.kinopoisk.ru/handler_trailer_popup.php?ids={}'.format(kinopoisk_id))
    return json.loads(content)
            time = re.findall(ur'\d+', dates[1].text)
            # Get price
            price_tag = trans.find('div', {'class': '_paid'})
            price = re.search(ur'\d+', price_tag.text).group()
            # Get link
            link_tag = trans.find('a', {'class': 'broadcast__table__link'})
            link = TRANSLATION_URL + link_tag.get('href')
            # Get championship
            championship_img = trans.find('td', {'class': '_icon'}).div.get('class')
            championship = champ_dict[championship_img[-1]]
            # Get value from translation page
            trans_page = simple_tor_get_page(link)
            trans_soup = BeautifulSoup(trans_page)
            value_div = trans_soup.find('div', {'class': 'broadcast'})
            value = value_div.iframe.get('src')
            # Create dict with information about the translation
            translation_data = {
                'title': title,
                'date': timezone.datetime(year=current_year, month=int(date[1]), day=int(date[0]),
                                          hour=int(time[0]), minute=int(time[1])),
                'price': float(price),
                'link': link,
                'meta': {'championship': championship if championship else None},
                'value': value,
            }
            translation_list.append(translation_data)
def get_image(template, actor_id):
    try:
        result = simple_tor_get_page(template.format(actor_id), tor_flag=False)
        return convert_file(result)
    except Exception, e:
        traceback_own(e)
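# Every snippet above relies on a project-internal helper named simple_tor_get_page.
# Its real implementation is not shown in this collection; the stub below is only an
# assumption reconstructed from the call sites (a URL plus an optional tor_flag,
# returning the raw response body). Some variants above treat the return value as a
# response object (.content) while others treat it as a byte string, so this sketch
# picks the byte-string form; the requests call and the local Tor SOCKS proxy address
# are hypothetical placeholders, not the project's actual code or settings.
import requests

def simple_tor_get_page(url, tor_flag=True):
    proxies = None
    if tor_flag:
        # Assumed local Tor SOCKS proxy; requires requests[socks] to be installed.
        proxies = {'http': 'socks5h://127.0.0.1:9050',
                   'https': 'socks5h://127.0.0.1:9050'}
    response = requests.get(url, proxies=proxies, timeout=60)
    return response.content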