コード例 #1
0
ファイル: Main.py プロジェクト: serbra/ru
  def Load_Page(self, url, i_rubric, Data):
    """Parse a single movie page and store the movie through ``Data``.

    The page is fetched with ``self.Auth.get_HTML`` and parsed as
    windows-1251 HTML.  Pages without an embedded flash ``<object>`` are
    ignored.  The poster URL, the title and the values that follow the
    ``<strong>`` headers inside ``div.post_content`` are collected into one
    record; when ``Data`` does not know the record id yet, the record and
    its countries and genres are added.

    Args:
        url: address of the movie page; also stored as the record URL.
        i_rubric: rubric value stored unchanged in the record.
        Data: storage object providing ``is_Serial_exist``, ``add_Serial``,
            ``add_Country`` and ``add_Genre``.

    Returns:
        None in every case (including the early exits for pages without a
        video or without a usable poster image).
    """
    html = self.Auth.get_HTML(url)
    soup = BeautifulSoup(html, fromEncoding="windows-1251")

    # Nothing to store when the page carries no flash player.
    if not soup.findAll('object', {'type': 'application/x-shockwave-flash'}):
        return

    rec = soup.find('div', {'class': 'post'})

    # -- poster image
    try:
        i_image = rec.find('div', {'class': 'post_content'}).find('img')['src']
    except Exception:
        # Fallback: read the src attribute out of the serialized <img> tag.
        try:
            img_html = str(rec.find('div', {'class': 'post_content'}).find('img'))
            # findall() returns a list; [0] raises IndexError when nothing
            # matched, which is handled like any other failure below.
            i_image = re.compile('src="(.+?)"', re.MULTILINE | re.DOTALL).findall(img_html)[0]
        except Exception:
            print('**** IMG!!')
            return
    # Addresses without a scheme are completed with the site prefix.
    if 'http://' not in i_image:
        i_image = 'http://fepcom.net' + i_image

    # -- title and page address
    i_name = unescape(rec.find('h1').text)
    i_url = url

    # -- labelled fields; '-' marks a field that was not found on the page
    info = rec.find('div', {'class': 'post_content'})

    o_name = '-'
    i_year = '-'
    i_country = '-'
    i_genre = '-'
    i_director = '-'
    i_actors = '-'
    i_text = '-'

    for inf in info.findAll("strong"):
        # The header text is compared without colons; the value is the node
        # that directly follows the <strong> element.
        header = inf.text.replace(':', '').encode('utf-8')
        if header == 'Оригинальное название':
            o_name = unescape(str(inf.nextSibling).strip())
        elif header == 'Год выхода на экран':
            i_year = unescape(str(inf.nextSibling).strip())
        elif header == 'Страна':
            i_country = unescape(str(inf.nextSibling).strip())
        elif header == 'Фильм относится к жанру':
            i_genre = unescape(str(inf.nextSibling).strip())
        elif header == 'Постановщик':
            i_director = unescape(str(inf.nextSibling).strip())
        elif header == 'Актеры, принявшие участие в съемках':
            i_actors = unescape(str(inf.nextSibling).strip())
        elif header == 'Краткое описание':
            # The description is the only value kept without strip().
            i_text = unescape(str(inf.nextSibling))

    # An original title identical to the displayed one is stored as empty.
    if i_name == o_name:
        o_name = ''

    # The record id is the MD5 hex digest of title + year.
    movie_id = f_md5((i_name + i_year).encode('utf-8')).hexdigest()
    movie = (movie_id, i_name, o_name, i_url, i_year, i_director, i_actors,
             i_country.title(), i_text, i_image, i_genre.title(), i_rubric)

    if Data.is_Serial_exist(movie_id) == False:
        Data.add_Serial(movie)

        # Countries may be separated by '-', '/', '.' or ','.
        for c in i_country.replace('-', ',').replace('/', ',').replace('.', ',').title().split(','):
            Data.add_Country(c.strip())

        for g in i_genre.title().split(','):
            Data.add_Genre(g.strip())

        print(i_name.encode('utf-8'))
コード例 #2
0
ファイル: Main.py プロジェクト: Backmute/seppius-xbmc-repo
  def Load_Page(self, i, Data):
    """Load listing page ``i`` of serialu.net and store its serials.

    The page ``http://serialu.net/page/<i>/`` is fetched with
    ``self.Auth.get_HTML``; the markup between the ``container`` and
    ``navigation`` divs is parsed and every ``div.entry`` is turned into a
    record.  Rubrics are added for every entry; the record, its countries
    and its genres are added only when ``Data`` does not know the record id
    yet.  Processing stops as soon as the module-level ``Update_flag``
    equals ``'OFF'``.  An entry that fails to parse is reported on stdout
    and skipped.

    Args:
        i: page number, inserted into the listing URL.
        Data: storage object providing ``add_Rubric``, ``is_Serial_exist``,
            ``add_Serial``, ``add_Country`` and ``add_Genre``.

    Returns:
        None.

    Raises:
        IndexError: when the container markup is not found in the page.
    """
    global Update_flag

    url = 'http://serialu.net/page/' + str(i) + '/'
    html = self.Auth.get_HTML(url)
    html_container = re.compile('<div class="container">(.+?)<div class="navigation">', re.MULTILINE | re.DOTALL).findall(html)

    # Paragraph tags are removed before parsing (opening tag -> two spaces).
    soup = BeautifulSoup(html_container[0].replace('<p>', '  ').replace('</p>', ''))

    for ser in soup.findAll("div", {"class": "entry"}):
        # The run is cancelled from outside through the module flag.
        if Update_flag == 'OFF':
            return
        try:
            title_link = ser.find("h2").find("a")
            i_name = self.unescape(title_link.text.strip())
            i_url = title_link["href"]

            # -- rubrics: each one is registered and appended to the list
            i_rubric = ''
            for r in ser.find('div', {'class': 'cat'}).findAll('a', {'rel': "category tag"}):
                rubric = r.text.capitalize()
                Data.add_Rubric(rubric)
                i_rubric = i_rubric + rubric + ', '

            info = ser.find("div", {"class": "content"})

            # -- poster image
            try:
                i_image = info.find("img")["src"]
            except Exception:
                # Fallback: look in the raw page for an <img> whose alt text
                # is the series name with typographic quotes normalised.
                ser_name = i_name.replace(u'”', u'"').replace(u'“', u'"').replace(u'«', u'"').replace(u'»', u'"')
                # re.escape keeps characters such as '(' or '+' in the name
                # from altering the pattern or shifting the capture groups.
                search_mask = '<img .+alt="' + re.escape(ser_name) + '"(.+?) src="(.+?)"'
                img_alt = re.compile(search_mask, re.MULTILINE | re.DOTALL).findall(unicode(html, 'utf-8'))
                try:
                    i_image = img_alt[0][1]
                except Exception:
                    i_image = '-'
                    print(('  ' + i_name + u' - image not found').encode('utf-8'))
                    print(ser.encode('utf-8'))

            # -- labelled fields; '-' marks a field that was not found
            o_name = '-'
            i_year = '-'
            i_country = '-'
            i_genre = '-'
            i_director = '-'
            i_actors = '-'
            i_text = '-'

            for inf in info.findAll("strong"):
                # Colons are dropped before comparing, so a header matches
                # with or without its trailing ':'.
                header = inf.text.replace(':', '').encode('utf-8')
                if header == 'Оригинальное название':
                    o_name = self.unescape(str(inf.nextSibling).strip())
                elif header == 'Год выхода на экран':
                    i_year = self.unescape(str(inf.nextSibling).strip())
                elif header == 'Страна':
                    i_country = self.unescape(str(inf.nextSibling).strip())
                elif header == 'Сериал относится к жанру':
                    i_genre = self.unescape(str(inf.nextSibling).strip())
                elif header == 'Постановщик':
                    i_director = self.unescape(str(inf.nextSibling).strip())
                elif header == 'Актеры, принявшие участие в съемках':
                    i_actors = self.unescape(str(inf.nextSibling).strip())
                elif header == 'Краткое описание':
                    # The description is the only value kept without strip().
                    i_text = self.unescape(str(inf.nextSibling))

            # An original title identical to the displayed one is stored empty.
            if i_name == o_name:
                o_name = ''

            # The record id is the MD5 hex digest of title + year.
            serial_id = self.f_md5((i_name + i_year).encode('utf-8')).hexdigest()

            rec = (serial_id, i_name, o_name, i_url, i_year, i_director, i_actors,
                   i_country.title(), i_text, i_image, i_genre.title(), i_rubric)
            if Data.is_Serial_exist(serial_id) == False:
                Data.add_Serial(rec)

                # Countries may be separated by '-', '/', '.' or ','.
                for c in i_country.replace('-', ',').replace('/', ',').replace('.', ',').title().split(','):
                    Data.add_Country(c.strip())

                for g in i_genre.title().split(','):
                    Data.add_Genre(g.strip())

                print(i_name.encode('utf-8'))
        except Exception as err:
            # Best effort: a broken entry must not stop the remaining ones,
            # but the failure is reported instead of being dropped silently.
            print('**** entry skipped: ' + repr(err))
コード例 #3
0
ファイル: Main.py プロジェクト: serbra/ru
    def Load_Page(self, i, Data):
        """Load listing page ``i`` of serialu.net and store its serials.

        The page ``http://serialu.net/page/<i>/`` is fetched with
        ``self.Auth.get_HTML``; the markup between the ``container`` and
        ``navigation`` divs is parsed and every ``div.entry`` is turned into
        a record.  Rubrics are added for every entry; the record, its
        countries and its genres are added only when ``Data`` does not know
        the record id yet.  Processing stops as soon as the module-level
        ``Update_flag`` equals ``'OFF'``.  An entry that fails to parse is
        reported on stdout and skipped.

        Args:
            i: page number, inserted into the listing URL.
            Data: storage object providing ``add_Rubric``,
                ``is_Serial_exist``, ``add_Serial``, ``add_Country`` and
                ``add_Genre``.

        Returns:
            None.

        Raises:
            IndexError: when the container markup is not found in the page.
        """
        global Update_flag

        url = 'http://serialu.net/page/' + str(i) + '/'
        html = self.Auth.get_HTML(url)
        html_container = re.compile(
            '<div class="container">(.+?)<div class="navigation">',
            re.MULTILINE | re.DOTALL).findall(html)

        # Paragraph tags are removed before parsing (opening tag -> two spaces).
        soup = BeautifulSoup(
            html_container[0].replace('<p>', '  ').replace('</p>', ''))

        for ser in soup.findAll("div", {"class": "entry"}):
            # The run is cancelled from outside through the module flag.
            if Update_flag == 'OFF':
                return
            try:
                title_link = ser.find("h2").find("a")
                i_name = self.unescape(title_link.text.strip())
                i_url = title_link["href"]

                # -- rubrics: each one is registered and appended to the list
                i_rubric = ''
                cat_links = ser.find('div', {'class': 'cat'}).findAll(
                    'a', {'rel': "category tag"})
                for r in cat_links:
                    rubric = r.text.capitalize()
                    Data.add_Rubric(rubric)
                    i_rubric = i_rubric + rubric + ', '

                info = ser.find("div", {"class": "content"})

                # -- poster image
                try:
                    i_image = info.find("img")["src"]
                except Exception:
                    # Fallback: look in the raw page for an <img> whose alt
                    # text is the series name with typographic quotes
                    # normalised.
                    ser_name = i_name.replace(u'”', u'"').replace(
                        u'“', u'"').replace(u'«', u'"').replace(u'»', u'"')
                    # re.escape keeps characters such as '(' or '+' in the
                    # name from altering the pattern or shifting the groups.
                    search_mask = ('<img .+alt="' + re.escape(ser_name) +
                                   '"(.+?) src="(.+?)"')
                    img_alt = re.compile(
                        search_mask,
                        re.MULTILINE | re.DOTALL).findall(unicode(html, 'utf-8'))
                    try:
                        i_image = img_alt[0][1]
                    except Exception:
                        i_image = '-'
                        print(('  ' + i_name +
                               u' - image not found').encode('utf-8'))
                        print(ser.encode('utf-8'))

                # -- labelled fields; '-' marks a field that was not found
                o_name = '-'
                i_year = '-'
                i_country = '-'
                i_genre = '-'
                i_director = '-'
                i_actors = '-'
                i_text = '-'

                for inf in info.findAll("strong"):
                    # Colons are dropped before comparing, so a header
                    # matches with or without its trailing ':'.
                    header = inf.text.replace(':', '').encode('utf-8')
                    if header == 'Оригинальное название':
                        o_name = self.unescape(str(inf.nextSibling).strip())
                    elif header == 'Год выхода на экран':
                        i_year = self.unescape(str(inf.nextSibling).strip())
                    elif header == 'Страна':
                        i_country = self.unescape(str(inf.nextSibling).strip())
                    elif header == 'Сериал относится к жанру':
                        i_genre = self.unescape(str(inf.nextSibling).strip())
                    elif header == 'Постановщик':
                        i_director = self.unescape(
                            str(inf.nextSibling).strip())
                    elif header == 'Актеры, принявшие участие в съемках':
                        i_actors = self.unescape(str(inf.nextSibling).strip())
                    elif header == 'Краткое описание':
                        # The description is the only value kept without
                        # strip().
                        i_text = self.unescape(str(inf.nextSibling))

                # An original title identical to the displayed one is stored
                # as an empty string.
                if i_name == o_name:
                    o_name = ''

                # The record id is the MD5 hex digest of title + year.
                serial_id = self.f_md5(
                    (i_name + i_year).encode('utf-8')).hexdigest()

                rec = (serial_id, i_name, o_name, i_url, i_year, i_director,
                       i_actors, i_country.title(), i_text, i_image,
                       i_genre.title(), i_rubric)
                if Data.is_Serial_exist(serial_id) == False:
                    Data.add_Serial(rec)

                    # Countries may be separated by '-', '/', '.' or ','.
                    for c in i_country.replace('-', ',').replace(
                            '/', ',').replace('.', ',').title().split(','):
                        Data.add_Country(c.strip())

                    for g in i_genre.title().split(','):
                        Data.add_Genre(g.strip())

                    print(i_name.encode('utf-8'))
            except Exception as err:
                # Best effort: a broken entry must not stop the remaining
                # ones, but the failure is reported instead of being dropped
                # silently.
                print('**** entry skipped: ' + repr(err))