def Load_Page(self, url, i_rubric, Data):
    """Scrape a single movie page and register the movie through ``Data``.

    The page is fetched with ``self.Auth.get_HTML``, parsed as
    windows-1251 HTML, and ignored when it embeds no flash player object.
    Otherwise a record tuple is built from the ``div.post`` element and,
    when its id is not yet known to ``Data``, stored together with its
    countries and genres.

    Args:
        url: address of the movie page; also stored in the record.
        i_rubric: rubric value stored as the last field of the record.
        Data: storage object; this method calls ``is_Serial_exist``,
            ``add_Serial``, ``add_Country`` and ``add_Genre`` on it.

    Returns:
        None on every path.
    """
    html = self.Auth.get_HTML(url)
    soup = BeautifulSoup(html, fromEncoding="windows-1251")

    # A page without an embedded flash object has no video: nothing to store.
    if len(soup.findAll('object', {'type': 'application/x-shockwave-flash'})) < 1:
        return

    rec = soup.find('div', {'class': 'post'})
    info = None
    if rec is not None:
        info = rec.find('div', {'class': 'post_content'})
    if info is None:
        # Without the content block neither the image nor the details exist.
        print('**** IMG!!')
        return

    # -- image: prefer the parsed attribute, fall back to a regex over the
    #    serialized <img> tag.
    img = info.find('img')
    try:
        i_image = img['src']
    except (KeyError, TypeError):
        found = re.compile('src="(.+?)"', re.MULTILINE | re.DOTALL).findall(str(img))
        if not found:
            # NOTE(review): the original returned an `empty` name that is not
            # defined in this block; a bare return matches the other exits.
            print('**** IMG!!')
            return
        # findall() yields a list; the address is its first match.
        i_image = found[0]
    if 'http://' not in i_image:
        i_image = 'http://fepcom.net' + i_image

    i_name = unescape(rec.find('h1').text)
    i_url = url

    # -- details: every <strong> label is followed by its value as the next
    #    sibling node.  Maps label (colon removed) -> (field, strip value?).
    labels = {
        'Оригинальное название': ('o_name', True),
        'Год выхода на экран': ('year', True),
        'Страна': ('country', True),
        'Фильм относится к жанру': ('genre', True),
        'Постановщик': ('director', True),
        'Актеры, принявшие участие в съемках': ('actors', True),
        'Краткое описание': ('text', False),
    }
    fields = dict.fromkeys(
        ('o_name', 'year', 'country', 'genre', 'director', 'actors', 'text'), '-')
    for inf in info.findAll("strong"):
        header = inf.text.replace(':', '').encode('utf-8')
        target = labels.get(header)
        if target is None:
            continue
        key, strip = target
        value = str(inf.nextSibling)
        fields[key] = unescape(value.strip() if strip else value)

    o_name = fields['o_name']
    i_year = fields['year']
    i_country = fields['country']
    i_genre = fields['genre']
    # An original title equal to the localized one carries no information.
    if i_name == o_name:
        o_name = ''

    movie_id = f_md5((i_name + i_year).encode('utf-8')).hexdigest()
    movie = (movie_id, i_name, o_name, i_url, i_year, fields['director'],
             fields['actors'], i_country.title(), fields['text'], i_image,
             i_genre.title(), i_rubric)

    # Explicit comparison kept: the return type of is_Serial_exist is not
    # visible from this block.
    # NOTE(review): the nesting below was reconstructed from a source whose
    # indentation was lost -- confirm everything belongs under this check.
    if Data.is_Serial_exist(movie_id) == False:
        Data.add_Serial(movie)
        countries = i_country.replace('-', ',').replace('/', ',').replace('.', ',')
        for c in countries.title().split(','):
            Data.add_Country(c.strip())
        for g in i_genre.title().split(','):
            Data.add_Genre(g.strip())
        print(i_name.encode('utf-8'))
def Load_Page(self, i, Data):
    """Scrape listing page ``i`` of serialu.net and store the serials found.

    The page is fetched with ``self.Auth.get_HTML``; the markup between the
    ``container`` and ``navigation`` divs is parsed and every ``div.entry``
    becomes one record.  Rubrics are always registered; the record, its
    countries and its genres are stored when the id is not yet known to
    ``Data``.  Processing stops as soon as the module-level ``Update_flag``
    equals ``'OFF'``.

    Args:
        i: page number appended to the listing address.
        Data: storage object; this method calls ``add_Rubric``,
            ``is_Serial_exist``, ``add_Serial``, ``add_Country`` and
            ``add_Genre`` on it.

    Returns:
        None on every path.
    """
    global Update_flag

    url = 'http://serialu.net/page/' + str(i) + '/'
    html = self.Auth.get_HTML(url)
    html_container = re.compile(
        '<div class="container">(.+?)<div class="navigation">',
        re.MULTILINE | re.DOTALL).findall(html)
    if not html_container:
        # The expected layout is missing from this page: nothing to parse.
        return

    soup = BeautifulSoup(html_container[0].replace('<p>', ' ').replace('</p>', ''))

    # Maps the text of a <strong> label to (field, strip value?).
    # 'Постановщик' was the only label compared without a trailing colon;
    # both spellings are accepted.
    labels = {
        'Оригинальное название:': ('o_name', True),
        'Год выхода на экран:': ('year', True),
        'Страна:': ('country', True),
        'Сериал относится к жанру:': ('genre', True),
        'Постановщик': ('director', True),
        'Постановщик:': ('director', True),
        'Актеры, принявшие участие в съемках:': ('actors', True),
        'Краткое описание:': ('text', False),
    }

    for ser in soup.findAll("div", {"class": "entry"}):
        # Check whether the process was cancelled.
        if Update_flag == 'OFF':
            return
        try:
            title_link = ser.find("h2").find("a")
            i_name = self.unescape(title_link.text.strip())
            i_url = title_link["href"]

            # -- rubrics: register each one and keep a ', '-terminated list.
            i_rubric = ''
            for r in ser.find('div', {'class': 'cat'}).findAll('a', {'rel': "category tag"}):
                rubric = r.text.capitalize()
                Data.add_Rubric(rubric)
                i_rubric = i_rubric + rubric + ', '

            info = ser.find("div", {"class": "content"})

            # -- image: parsed attribute first, then a search of the raw page
            #    for an <img> whose alt text is the serial name.
            try:
                i_image = info.find("img")["src"]
            except (AttributeError, KeyError, TypeError):
                ser_name = (i_name.replace(u'”', u'"').replace(u'“', u'"')
                            .replace(u'«', u'"').replace(u'»', u'"'))
                # The name is escaped so characters such as '(' cannot add
                # groups and shift the index of the src group.
                # NOTE(review): the original literal had a raw line break
                # between the space and `src=`; it is matched here as generic
                # whitespace -- confirm against the real markup.
                search_mask = ('<img .+alt="' + re.escape(ser_name)
                               + '"(.+?)\\s+src="(.+?)"')
                img_alt = re.compile(
                    search_mask, re.MULTILINE | re.DOTALL | re.UNICODE
                ).findall(unicode(html, 'utf-8'))
                if img_alt:
                    i_image = img_alt[0][1]
                else:
                    i_image = '-'
                    print((' ' + i_name + u' - image not found').encode('utf-8'))
                    # Dump the offending entry for debugging.
                    print(str(ser))

            # -- details: each label's value is its next sibling node.
            fields = dict.fromkeys(
                ('o_name', 'year', 'country', 'genre', 'director', 'actors', 'text'),
                '-')
            for inf in info.findAll("strong"):
                target = labels.get(inf.text.encode('utf-8'))
                if target is None:
                    continue
                key, strip = target
                value = str(inf.nextSibling)
                fields[key] = self.unescape(value.strip() if strip else value)

            o_name = fields['o_name']
            i_year = fields['year']
            i_country = fields['country']
            i_genre = fields['genre']
            # An original title equal to the localized one carries no
            # information.
            if i_name == o_name:
                o_name = ''

            serial_id = self.f_md5((i_name + i_year).encode('utf-8')).hexdigest()
            rec = (serial_id, i_name, o_name, i_url, i_year, fields['director'],
                   fields['actors'], i_country.title(), fields['text'], i_image,
                   i_genre.title(), i_rubric)

            # Explicit comparison kept: the return type of is_Serial_exist is
            # not visible from this block.
            # NOTE(review): the nesting below was reconstructed from a source
            # whose indentation was lost -- confirm it all belongs under this
            # check.
            if Data.is_Serial_exist(serial_id) == False:
                Data.add_Serial(rec)
                countries = (i_country.replace('-', ',').replace('/', ',')
                             .replace('.', ','))
                for c in countries.title().split(','):
                    Data.add_Country(c.strip())
                for g in i_genre.title().split(','):
                    Data.add_Genre(g.strip())
                print(i_name.encode('utf-8'))
        except Exception as err:
            # Best effort: one malformed entry must not abort the page, but
            # the failure is reported instead of being silently dropped.
            print('**** entry skipped: %r' % (err,))
def Load_Page(self, i, Data):
    """Scrape listing page ``i`` of serialu.net and store the serials found.

    The page is fetched with ``self.Auth.get_HTML``; the markup between the
    ``container`` and ``navigation`` divs is parsed and every ``div.entry``
    becomes one record.  Rubrics are always registered; the record, its
    countries and its genres are stored when the id is not yet known to
    ``Data``.  Processing stops as soon as the module-level ``Update_flag``
    equals ``'OFF'``.

    Args:
        i: page number appended to the listing address.
        Data: storage object; this method calls ``add_Rubric``,
            ``is_Serial_exist``, ``add_Serial``, ``add_Country`` and
            ``add_Genre`` on it.

    Returns:
        None on every path.
    """
    global Update_flag

    url = 'http://serialu.net/page/' + str(i) + '/'
    html = self.Auth.get_HTML(url)
    html_container = re.compile(
        '<div class="container">(.+?)<div class="navigation">',
        re.MULTILINE | re.DOTALL).findall(html)
    if not html_container:
        # The expected layout is missing from this page: nothing to parse.
        return

    soup = BeautifulSoup(html_container[0].replace('<p>', ' ').replace('</p>', ''))

    # Maps the text of a <strong> label to (field, strip value?).
    # 'Постановщик' was the only label compared without a trailing colon;
    # both spellings are accepted.
    labels = {
        'Оригинальное название:': ('o_name', True),
        'Год выхода на экран:': ('year', True),
        'Страна:': ('country', True),
        'Сериал относится к жанру:': ('genre', True),
        'Постановщик': ('director', True),
        'Постановщик:': ('director', True),
        'Актеры, принявшие участие в съемках:': ('actors', True),
        'Краткое описание:': ('text', False),
    }

    for ser in soup.findAll("div", {"class": "entry"}):
        # Check whether the process was cancelled.
        if Update_flag == 'OFF':
            return
        try:
            title_link = ser.find("h2").find("a")
            i_name = self.unescape(title_link.text.strip())
            i_url = title_link["href"]

            # -- rubrics: register each one and keep a ', '-terminated list.
            i_rubric = ''
            for r in ser.find('div', {'class': 'cat'}).findAll('a', {'rel': "category tag"}):
                rubric = r.text.capitalize()
                Data.add_Rubric(rubric)
                i_rubric = i_rubric + rubric + ', '

            info = ser.find("div", {"class": "content"})

            # -- image: parsed attribute first, then a search of the raw page
            #    for an <img> whose alt text is the serial name.
            try:
                i_image = info.find("img")["src"]
            except (AttributeError, KeyError, TypeError):
                ser_name = (i_name.replace(u'”', u'"').replace(u'“', u'"')
                            .replace(u'«', u'"').replace(u'»', u'"'))
                # The name is escaped so characters such as '(' cannot add
                # groups and shift the index of the src group.
                # NOTE(review): the original literal had a raw line break
                # between the space and `src=`; it is matched here as generic
                # whitespace -- confirm against the real markup.
                search_mask = ('<img .+alt="' + re.escape(ser_name)
                               + '"(.+?)\\s+src="(.+?)"')
                img_alt = re.compile(
                    search_mask, re.MULTILINE | re.DOTALL | re.UNICODE
                ).findall(unicode(html, 'utf-8'))
                if img_alt:
                    i_image = img_alt[0][1]
                else:
                    i_image = '-'
                    print((' ' + i_name + u' - image not found').encode('utf-8'))
                    # Dump the offending entry for debugging.
                    print(str(ser))

            # -- details: each label's value is its next sibling node.
            fields = dict.fromkeys(
                ('o_name', 'year', 'country', 'genre', 'director', 'actors', 'text'),
                '-')
            for inf in info.findAll("strong"):
                target = labels.get(inf.text.encode('utf-8'))
                if target is None:
                    continue
                key, strip = target
                value = str(inf.nextSibling)
                fields[key] = self.unescape(value.strip() if strip else value)

            o_name = fields['o_name']
            i_year = fields['year']
            i_country = fields['country']
            i_genre = fields['genre']
            # An original title equal to the localized one carries no
            # information.
            if i_name == o_name:
                o_name = ''

            serial_id = self.f_md5((i_name + i_year).encode('utf-8')).hexdigest()
            rec = (serial_id, i_name, o_name, i_url, i_year, fields['director'],
                   fields['actors'], i_country.title(), fields['text'], i_image,
                   i_genre.title(), i_rubric)

            # Explicit comparison kept: the return type of is_Serial_exist is
            # not visible from this block.
            # NOTE(review): the nesting below was reconstructed from a source
            # whose indentation was lost -- confirm it all belongs under this
            # check.
            if Data.is_Serial_exist(serial_id) == False:
                Data.add_Serial(rec)
                countries = (i_country.replace('-', ',').replace('/', ',')
                             .replace('.', ','))
                for c in countries.title().split(','):
                    Data.add_Country(c.strip())
                for g in i_genre.title().split(','):
                    Data.add_Genre(g.strip())
                print(i_name.encode('utf-8'))
        except Exception as err:
            # Best effort: one malformed entry must not abort the page, but
            # the failure is reported instead of being silently dropped.
            print('**** entry skipped: %r' % (err,))