def get_notes(self):
    """Build ``self.notes`` from language, audio, colour and tagline data.

    The Language, Sound Mix and Color sections are read from ``self.page``
    and the taglines from ``self.tagl_page``. Every non-empty value adds
    one ``label: value`` line to ``self.notes``.
    """
    self.notes = ''

    def extract_section(start_pattern):
        # Cut the section up to the closing </div>, drop the markup,
        # remove newlines and squeeze runs of spaces.
        text = gutils.strip_tags(
            gutils.regextrim(self.page, start_pattern, '</div>'))
        text = re.sub('[\n]+', '', text)
        return re.sub('[ ]+', ' ', text).strip()

    language = extract_section('Language:<[^>]+>')
    color = extract_section('Color:<[^>]+>')
    sound = extract_section('Sound Mix:<[^>]+>')

    tagline_html = gutils.regextrim(self.tagl_page, '>Taglines', '>See also')
    chunks = re.split('<div[^>]+class="soda[^>]*>', tagline_html)
    # chunks[0] is whatever precedes the first tagline <div>; skip it.
    cleaned = [gutils.clean(gutils.before(chunk, '</div>'))
               for chunk in chunks[1:]]
    tagline = ''.join(entry + '\n' for entry in cleaned if entry)

    if language:
        self.notes = "%s: %s\n" % (_('Language'), language)
    if sound:
        self.notes += "%s: %s\n" % (gutils.strip_tags(_('<b>Audio</b>')), sound)
    if color:
        self.notes += "%s: %s\n" % (_('Color'), color)
    if tagline:
        self.notes += "%s: %s\n" % ('Tagline', tagline)
def get_notes(self):
    """Fill ``self.notes`` with language, audio, colour and tagline lines.

    Language, Color and Sound Mix come from ``self.page``; taglines come
    from ``self.tagl_page``. Empty values are left out.
    """
    self.notes = ""

    # The three page sections are extracted and normalised the same way.
    details = {}
    for key, marker in (("language", "Language:<[^>]+>"),
                        ("color", "Color:<[^>]+>"),
                        ("sound", "Sound Mix:<[^>]+>")):
        value = gutils.regextrim(self.page, marker, "</div>")
        value = gutils.strip_tags(value)
        value = re.sub("[\n]+", "", value)
        value = re.sub("[ ]+", " ", value)
        details[key] = value.strip()

    tagline_section = gutils.regextrim(self.tagl_page, ">Taglines", ">See also")
    pieces = re.split('<div[^>]+class="soda[^>]*>', tagline_section)
    tagline = ""
    # Everything before the first tagline <div> (pieces[0]) is ignored.
    for piece in pieces[1:]:
        text = gutils.clean(gutils.before(piece, "</div>"))
        if text:
            tagline += text + "\n"

    if details["language"]:
        self.notes = "%s: %s\n" % (_("Language"), details["language"])
    if details["sound"]:
        self.notes += "%s: %s\n" % (gutils.strip_tags(_("<b>Audio</b>")), details["sound"])
    if details["color"]:
        self.notes += "%s: %s\n" % (_("Color"), details["color"])
    if tagline:
        self.notes += "%s: %s\n" % ("Tagline", tagline)
def get_image(self):
    """Set ``self.image_url`` to the poster URL found on the fetched pages.

    Sources are tried in this order: the ``"picture"`` value embedded in
    ``self.page``; the cover area of ``self.page`` (or, when there is no
    cover area, an images.kino.de link found before the line-height
    span); and finally the cover area of ``self.videopage``.
    """
    cover_link = '(http[:][/][/][^/]+[/]flbilder[/][^"\']+)'
    kino_prefix = 'http://images.kino.de/s/'

    embedded = gutils.trim(self.page, '"picture":', ',')
    self.image_url = embedded.replace('"', '').replace('\\', '')
    if self.image_url:
        return

    cover_area = gutils.regextrim(self.page, '<div class="cover-area">', '</div>')
    if cover_area:
        # video page
        found = re.search(cover_link, cover_area)
        if found:
            self.image_url = found.group(1)
    else:
        # kino page
        head = gutils.before(self.page, '<span style="line-height: 15px;">')
        if head:
            parts = re.split(kino_prefix, head)
            # Prefer the second images.kino.de link, else take the first.
            if len(parts) > 2:
                self.image_url = kino_prefix + gutils.before(parts[2], '"')
            elif len(parts) > 1:
                self.image_url = kino_prefix + gutils.before(parts[1], '"')

    if not self.image_url and self.videopage:
        cover_area = gutils.regextrim(self.videopage, '<div class="cover-area">', '</div>')
        if cover_area:
            # video page
            found = re.search(cover_link, cover_area)
            if found:
                self.image_url = found.group(1)
def get_o_title(self):
    """Set ``self.o_title`` from ``self.page``, with double quotes removed.

    Tried in order: the ``title-extra`` element, the ``<h1>`` heading up
    to its ``<span``, and the ``<title>`` text cut at the first `` (``.
    """
    title = gutils.regextrim(self.page, 'class="title-extra"[^>]*>', '<')
    if not title:
        title = gutils.regextrim(self.page, '<h1>', '([ ]|[&][#][0-9]+[;])<span')
    if not title:
        page_title = gutils.trim(self.page, '<title>', '</title>')
        title = re.sub(' [(].*', '', page_title)
    self.o_title = re.sub('"', '', title)
def get_notes(self):
    """Collect language, audio, colour and taglines into ``self.notes``.

    The first three values are taken from ``self.page`` and the taglines
    from ``self.tagl_page``. Only non-empty values produce a line.
    """
    self.notes = ''

    language = gutils.strip_tags(
        gutils.regextrim(self.page, 'Language:<[^>]+>', '</div>'))
    language = re.sub('[ ]+', ' ', re.sub('[\n]+', '', language)).strip()

    color = gutils.strip_tags(
        gutils.regextrim(self.page, 'Color:<[^>]+>', '</div>'))
    color = re.sub('[ ]+', ' ', re.sub('[\n]+', '', color)).strip()

    sound = gutils.strip_tags(
        gutils.regextrim(self.page, 'Sound Mix:<[^>]+>', '</div>'))
    sound = re.sub('[ ]+', ' ', re.sub('[\n]+', '', sound)).strip()

    raw_taglines = gutils.regextrim(self.tagl_page, '>Taglines', '>See also')
    blocks = re.split('<div[^>]+class="soda[^>]*>', raw_taglines)
    tagline_lines = []
    # The text ahead of the first tagline <div> (blocks[0]) is not a tagline.
    for block in blocks[1:]:
        cleaned = gutils.clean(gutils.before(block, '</div>'))
        if cleaned:
            tagline_lines.append(cleaned + '\n')
    tagline = ''.join(tagline_lines)

    if language:
        self.notes = "%s: %s\n" % (_('Language'), language)
    if sound:
        audio_label = gutils.strip_tags(_('<b>Audio</b>'))
        self.notes += "%s: %s\n" % (audio_label, sound)
    if color:
        self.notes += "%s: %s\n" % (_('Color'), color)
    if tagline:
        self.notes += "%s: %s\n" % ('Tagline', tagline)
def get_o_title(self):
    """Store the original title in ``self.o_title``, without double quotes.

    ``self.page`` is searched for the ``title-extra`` element first, then
    the ``<h1>`` heading, then the ``<title>`` tag (cut at the first
    `` (``).
    """
    self.o_title = gutils.regextrim(self.page, 'class="title-extra"[^>]*>', "<")
    if not self.o_title:
        self.o_title = gutils.regextrim(self.page, "<h1>", "([ ]|[&][#][0-9]+[;])<span")
        if not self.o_title:
            window_title = gutils.trim(self.page, "<title>", "</title>")
            self.o_title = re.sub(" [(].*", "", window_title)
    # Quotes are stripped whichever source supplied the title.
    self.o_title = re.sub('"', "", self.o_title)
def get_country(self):
    """Set ``self.country`` from the ``standardsmall`` span of ``self.tmp_page``.

    An optional leading medium label (DVD, VHS, ...) is skipped by the
    start pattern. The country is the ``<strong>`` text following a dash,
    with any trailing digits removed. ``self.country`` becomes ``''`` when
    the span lookup returns ``None``.
    """
    span = gutils.regextrim(
        self.tmp_page,
        'span class="standardsmall"[^>]*><strong>((DVD|VHS|Laser Disc|Video CD|Blue-ray Disc)</strong>[ \t]-[ \t]<strong>)*',
        '</span>')
    if span is None:
        self.country = ''
        return
    country = gutils.regextrim(span, '-[ \t]<strong>', '</strong>')
    self.country = re.sub('[0-9]+$', '', country)
def get_o_title(self):
    """Set ``self.o_title`` from ``self.tmp_page``.

    The parenthesised text in the ``standardsmall`` span is used when
    present. Otherwise the text of the ``headline2`` link is taken; the
    link target is ``/videofilm`` when ``self.url_type`` is ``'V'`` and
    ``/kinofilm`` for any other type.
    """
    self.o_title = gutils.trim(self.tmp_page, 'span class="standardsmall">(', ')<')
    if self.o_title != '':
        return
    if self.url_type == 'V':
        link_pattern = 'headline2"[^>]*>[ \t\r\n]*<a href="/videofilm'
    else:
        link_pattern = 'headline2"[^>]*>[ \t\r\n]*<a href="/kinofilm'
    headline_link = gutils.regextrim(self.tmp_page, link_pattern, '</a>')
    self.o_title = gutils.after(headline_link, '>')
def get_image(self):
    """Find the poster URL and store it in ``self.image_url``.

    The embedded ``"picture"`` value of ``self.page`` wins. Failing that,
    the cover area of ``self.page`` is searched for a ``flbilder`` link;
    pages without a cover area are searched for an images.kino.de link
    before the line-height span. The cover area of ``self.videopage`` is
    the last resort.
    """
    poster_regex = '(http[:][/][/][^/]+[/]flbilder[/][^"\']+)'
    kino_base = 'http://images.kino.de/s/'

    picture = gutils.trim(self.page, '"picture":', ',')
    picture = picture.replace('"', '')
    self.image_url = picture.replace('\\', '')

    if not self.image_url:
        area = gutils.regextrim(self.page, '<div class="cover-area">', '</div>')
        if area:
            # video page
            hit = re.search(poster_regex, area)
            if hit:
                self.image_url = hit.group(1)
        else:
            # kino page
            before_span = gutils.before(self.page, '<span style="line-height: 15px;">')
            if before_span:
                segments = re.split(kino_base, before_span)
                count = len(segments)
                # The second link is preferred over the first when both exist.
                if count > 2:
                    self.image_url = kino_base + gutils.before(segments[2], '"')
                elif count > 1:
                    self.image_url = kino_base + gutils.before(segments[1], '"')

    if not self.image_url and self.videopage:
        area = gutils.regextrim(self.videopage, '<div class="cover-area">', '</div>')
        if area:
            # video page
            hit = re.search(poster_regex, area)
            if hit:
                self.image_url = hit.group(1)
def get_image(self):
    """Set ``self.image_url`` to the poster hosted on bilder.filmdb.de.

    The ``plakat.php`` query string found in ``self.page`` is used to
    fetch the poster page. The image path is looked up on that fetched
    page first and on ``self.page`` as a fallback. ``self.image_url`` is
    left untouched when no poster link or image path is found.
    """
    poster_query = gutils.regextrim(self.page, 'plakat.php[?]', '["\']')
    if not poster_query:
        return

    # Previously the fetched poster page was stored and never read; only
    # self.page was searched for the image path.
    # NOTE(review): assumes open_page(url=...) returns the page text - confirm.
    poster_page = self.open_page(url='http://www.filmdb.de/plakat.php?' + poster_query)

    image_path = ''
    if poster_page:
        image_path = gutils.regextrim(poster_page, 'bilder.filmdb.de', '["\']')
    if not image_path:
        # Keep the old lookup so results that worked before still work.
        image_path = gutils.regextrim(self.page, 'bilder.filmdb.de', '["\']')
    if image_path:
        self.image_url = 'http://bilder.filmdb.de' + image_path
def get_classification(self):
    """Set ``self.classification`` to the FSK rating.

    Tried in order: the embedded ``"fsk"`` value of ``self.page`` (double
    quotes removed), the ``FSK: `` text on ``self.page``, and the same
    text on ``self.videopage`` when a video page is available.
    """
    rating = gutils.trim(self.page, '"fsk":', ',').replace('"', '')
    if not rating:
        rating = gutils.regextrim(self.page, 'FSK: ', '<')
    if not rating and self.videopage:
        rating = gutils.regextrim(self.videopage, 'FSK: ', '<')
    self.classification = rating
def get_image(self):
    """Locate the poster image and store its URL in ``self.image_url``.

    ``self.page`` must contain a ``plakat.php?...`` link; its query string
    is used to fetch the poster page. The ``bilder.filmdb.de`` path is
    read from the fetched page, or from ``self.page`` if the fetched page
    does not provide one. Nothing is assigned when no path is found.
    """
    query = gutils.regextrim(self.page, 'plakat.php[?]', '["\']')
    if query:
        # The fetched page used to be assigned to a local and then ignored,
        # so the extra request never influenced the result.
        # NOTE(review): assumes open_page(url=...) returns the page text - confirm.
        fetched = self.open_page(
            url='http://www.filmdb.de/plakat.php?' + query)
        path = gutils.regextrim(fetched, 'bilder.filmdb.de', '["\']') if fetched else ''
        if not path:
            # Previous behaviour: look for the image on the main page.
            path = gutils.regextrim(self.page, 'bilder.filmdb.de', '["\']')
        if path:
            self.image_url = 'http://bilder.filmdb.de' + path
def get_notes(self):
    """Build ``self.notes`` from the "Critica" and "Note" sections of ``self.page``.

    Each non-empty section is added under its own heading. ``<br>`` tags
    become line breaks in both sections.
    """
    self.notes = ''

    review_html = gutils.regextrim(self.page, 'Critica</font>', "(</td>|\n|Note<)")
    critica = gutils.clean(review_html.replace('<br>', '\n'))
    if critica:
        self.notes = 'Critica:\n\n' + critica + '\n\n'

    note_html = gutils.regextrim(self.page, 'Note</font>', "(</td>|\n|Critica<)")
    note = gutils.clean(note_html.replace('<br>', '--BR--'))
    if note:
        # Line breaks would not survive the capitalisation step, so they
        # travel through it as a placeholder. It is substituted back in
        # its lower-cased form afterwards.
        note = self.capwords(note)
        self.notes = self.notes + 'Note:\n\n' + note.replace('--br--', '\n')
def get_classification(self):
    """Set ``self.classification`` to the US content rating.

    The ``contentRating`` meta tag of ``self.page`` is preferred. Without
    it the "United States" entry of the certifications list on
    ``self.cert_page`` is used, or the ``USA:`` entry of the older
    ``Certification`` block when that list is missing.
    """
    # Until the user's location can be determined, the US classification
    # is the one that gets reported.
    rating = gutils.trim(self.page, '<meta itemprop="contentRating" content="', '"')
    if not rating:
        cert_list = gutils.regextrim(self.cert_page, 'id="certifications-list"', r'<\/ul>')
        if cert_list:
            rating = gutils.regextrim(cert_list, '>United States:', '<')
        else:
            # the old way
            legacy_block = gutils.trim(self.cert_page, '>Certification:<', '</div>')
            rating = gutils.trim(legacy_block, '>USA:', '<')
    self.classification = rating
def get_o_title(self):
    """Set ``self.o_title``, stopping at the first source that yields text.

    Order: the ``Originaltitel`` entry of ``self.page``, the parenthesised
    part of the ``<h1(`` heading, the ``teaser`` div, the ``Originaltitel``
    paragraph of ``self.videopage`` (if there is a video page), and last
    the plain ``<h1>`` heading.
    """
    self.o_title = gutils.regextrim(
        self.page, '(<p>Originaltitel[:] |Originaltitel<[^>]+>)', '(</tr>|</p>)')
    if self.o_title:
        return
    self.o_title = gutils.trim(self.page, '<h1(', ')')
    if self.o_title:
        return
    self.o_title = gutils.trim(self.page, '<div class="teaser">', '</')
    if self.o_title:
        return
    if self.videopage:
        self.o_title = gutils.trim(self.videopage, '<p>Originaltitel: ', '</p>')
        if self.o_title:
            return
    self.o_title = gutils.regextrim(self.page, '<h1>', '(</h1>|</span>)')
def get_searches(self):
    """Fill ``self.ids`` (and ``self.titles``) from the page in ``self.page``.

    On a search-result page (``<title>Suche`` found past position 0) each
    ``hit.php3?hit=`` link that carries a ``movie-<id>-`` part contributes
    an id and a title. On any other page only the id taken from the
    ``index.php3?id=`` link is appended.
    """
    if self.page.find('<title>Suche') > 0:
        # The text before the first hit link is not a result.
        for hit in self.page.split("hit.php3?hit=")[1:]:
            if hit == '':
                continue
            movie_id = gutils.trim(hit, 'movie-', '-')
            if movie_id == '':
                continue
            self.ids.append(movie_id)
            link_text = gutils.regextrim(hit, '>', '</[Aa]>')
            self.titles.append(gutils.strip_tags(link_text.replace('<br />', ' - ')))
    else:
        movie_id = gutils.regextrim(self.page, 'index[.]php3[?]id=', '("|;|\')')
        self.ids.append(movie_id)
def get_plot(self):
    """Set ``self.plot``, preferring the localised pages over the originals.

    The ``Trama`` section of ``self.page`` is the starting value and is
    replaced by the ``swiki.2.1`` block of ``self.plot_page`` when that
    exists. Every ``plotpar`` paragraph of ``self.plot_page`` is then
    appended. If nothing was found, the ``Plot`` section of
    ``self.imdb_page`` and the ``plotpar`` paragraphs of
    ``self.imdb_plot_page`` are used instead.

    The older "Resumen" extraction is no longer performed: its result was
    always overwritten by the ``Trama`` lookup before it could be used.
    """
    self.plot = gutils.trim(self.page, '<h5>Trama:</h5>', '</div>')
    self.plot = self.__before_more(self.plot)
    wiki_plot = gutils.trim(self.plot_page, '<div id="swiki.2.1">', '</div>')
    if wiki_plot:
        self.plot = wiki_plot

    # The chunk before the first paragraph marker is not a paragraph.
    paragraphs = self.plot_page.split('<p class="plotpar">')[1:]
    if paragraphs:
        self.plot = self.plot + '\n\n' + ''.join(
            gutils.strip_tags(gutils.before(paragraph, '</a>')) + '\n'
            for paragraph in paragraphs if paragraph != '')

    if not self.plot:
        # nothing in spanish found, try original
        self.plot = gutils.regextrim(self.imdb_page, '<h5>Plot:</h5>', '(</div>|<a href.*)')
        self.plot = self.__before_more(self.plot)
        paragraphs = self.imdb_plot_page.split('<p class="plotpar">')[1:]
        if paragraphs:
            self.plot = self.plot + '\n\n' + ''.join(
                gutils.strip_tags(gutils.before(paragraph, '</a>')) + '\n\n'
                for paragraph in paragraphs if paragraph != '')
def get_cameraman(self):
    """Set ``self.cameraman`` from the ``zdjęcia`` row of ``self.creditspage``.

    ``<br />`` separators become ``", "``, the text is passed through
    ``gutils.clean`` and a single trailing comma is dropped.
    """
    row = gutils.regextrim(self.creditspage, u"zdjęcia: <", "(</tr>|<tr>)")
    names = gutils.after(row, ">").replace("<br />", ", ")
    names = gutils.clean(names)
    if names.endswith(","):
        names = names[:-1]
    self.cameraman = names
def get_plot(self):
    """Fetch the plot page linked from ``self.page`` and set ``self.plot``.

    ``self.plot`` stays ``''`` when the plot link lookup returns ``None``
    or the plot page cannot be loaded.
    """
    self.plot = ''
    story_id = gutils.regextrim(self.page, '<a href="plot/', '(">|[&])')
    if story_id is None:
        return
    story_url = "https://ssl.ofdb.de/plot/%s" % (story_id.encode('utf8'))
    story_page = self.open_page(url=story_url)
    if story_page:
        self.plot = gutils.trim(story_page, "</b><br><br>", "</")
def get_plot(self):
    """Set ``self.plot`` from the separate plot page referenced by ``self.page``.

    The plot id is read from the ``plot/`` link; the plot text is the part
    of the fetched page after ``</b><br><br>``. ``self.plot`` is ``''``
    when there is no id or no page.
    """
    self.plot = ''
    plot_id = gutils.regextrim(self.page, '<a href="plot/', '(">|[&])')
    if plot_id is not None:
        plot_page = self.open_page(
            url="http://www.ofdb.de/plot/%s" % (plot_id.encode('utf8')))
        if plot_page:
            self.plot = gutils.trim(plot_page, "</b><br><br>", "</")
def get_cast(self):
    """Set ``self.cast`` from the ``(Darsteller)`` section of ``self.page``.

    `` als `` is replaced by the translated `` as ``, tabs and line breaks
    are removed, each ``", "`` becomes a line break and any remaining
    commas are deleted.
    """
    cast = gutils.regextrim(self.page, '[(]Darsteller[)]', '(<[pP]>|<br><span[^>]+>)')
    cast = gutils.clean(cast).replace(' als ', _(' as '))
    cast = re.sub('( \t|\t|\r|\n)', '', cast)
    self.cast = cast.replace(', ', '\n').replace(',', '')
def get_year(self):
    """Set ``self.year`` to the four-digit year in the ``standardsmall`` span.

    The year is the four digits directly before a ``</strong>`` inside
    the span of ``self.tmp_page``. ``self.year`` is ``''`` when the span
    lookup returns ``None`` or no such digits are present.
    """
    self.year = ''
    span = gutils.regextrim(self.tmp_page, 'span class="standardsmall"[^>]*><strong>', '</span>')
    if span is not None:
        # Capture only the digits: taking the whole match used to leave
        # the closing "</strong>" tag attached to the stored year.
        match = re.search('([0-9][0-9][0-9][0-9])</strong>', span)
        if match is not None:
            self.year = match.group(1)
def get_searches(self):
    """Append one id and one title per DVD / Blu-ray link in ``self.page``.

    Ids are prefixed with ``dvd/`` or ``blu-ray/``. Each title is the link
    text followed by a cleaned ``(<medium> - <details>)`` suffix built
    from the first ``<div>`` after the link.
    """
    # One capturing group makes re.split return
    # [lead, link_kind, rest, link_kind, rest, ...].
    parts = re.split(' <a title="[^"]+" href="(/datenbank/medien/dvd/|/datenbank/medien/blu-ray/)', self.page)
    for link_kind, rest in zip(parts[1::2], parts[2::2]):
        if rest is None:
            continue
        if link_kind == '/datenbank/medien/blu-ray/':
            medium, id_prefix = 'Blu-Ray', 'blu-ray/'
        else:
            medium, id_prefix = 'DVD', 'dvd/'
        self.ids.append(id_prefix + gutils.before(rest, '"'))

        link_text = gutils.trim(rest, '>', '</a>')
        details = gutils.regextrim(rest, '<div [^>]*>', '</div>')
        details = details.replace('<br>', ' - ')
        # NOTE(review): the search string below is reproduced as found; it
        # may originally have been a no-break space or '&nbsp;' - confirm
        # against the upstream plugin.
        details = details.replace(' ', '')
        details = re.sub('[ \t\n]+', ' ', details)
        self.titles.append(
            link_text + gutils.clean('(' + medium + ' - ' + details + ')'))
def get_image(self):
    """Set ``self.image_url`` to the film's poster, or ``""`` if none is found.

    The poster file name is the text between the ``images_locandine``
    folder for ``self.movie_id`` and the ``.jpg`` / ``.JPG`` extension in
    ``self.page``.
    """
    folder_pattern = "../images_locandine/%s/" % self.movie_id
    poster_name = gutils.regextrim(self.page, folder_pattern, ".(JPG|jpg)\"")
    if poster_name == "":
        self.image_url = ""
        return
    self.image_url = "http://www.cinematografo.it/bancadati/images_locandine/%s/%s.jpg" % (self.movie_id, poster_name)
def get_plot(self):
    """Set ``self.plot`` from the cell following the ``showcover.php`` tag.

    The text is taken from ``self.page`` up to the closing ``</td>``.
    ``\\x93`` is replaced by a double quote, dash characters by ``-``, and
    a trailing ``<n> Views`` counter is removed.
    """
    plot = gutils.regextrim(self.page, 'showcover.php[^>]*>', '</td>')
    plot = re.sub('[\x93]', '"', plot)
    # The character class for the dash replacement was empty ('[]'), which
    # is not a valid regular expression and made every call raise.
    # NOTE(review): \x96 / \x97 (cp1252 en and em dash) are assumed to be
    # the characters that were lost - confirm against the upstream plugin.
    plot = re.sub('[\x96\x97]', '-', plot)
    self.plot = re.sub('[0-9 ]+Views', '', plot)
def get_cast(self):
    """Set ``self.cast`` from the ``(Darsteller)`` section of ``self.page``.

    `` als `` is replaced by the translated `` as ``, tabs and line breaks
    are removed and every comma becomes a line break.
    """
    section = gutils.regextrim(self.page, '[(]Darsteller[)]', '(</td>|<br><span[^>]+>)')
    cast = gutils.clean(section)
    cast = cast.replace(' als ', _(' as '))
    cast = re.sub('( \t|\t|\r|\n)', '', cast)
    self.cast = cast.replace(',', '\n')
def get_plot(self):
    """Set ``self.plot`` from the plot summaries, with an older-layout fallback.

    Every ``<li`` entry of the ``plot-summaries-content`` list on
    ``self.plot_page`` contributes its ``<p>`` text and its author line,
    unless it carries the "no Plot Summaries" placeholder. If that yields
    nothing, the ``description`` of ``self.page`` is used and the
    ``plotpar`` paragraphs (or ``odd`` / ``even`` list items) of
    ``self.plot_page`` are appended.
    """
    no_summary_marker = 'It looks like we don\'t have any Plot Summaries for this title yet.'
    summary_list = gutils.trim(self.plot_page, 'id="plot-summaries-content">', '</ul>')

    pieces = []
    for item in summary_list.split('<li'):
        if item == '' or no_summary_marker in item:
            continue
        pieces.append(gutils.trim(item, '<p>', '</p>') + '\n')
        author = gutils.trim(item, '<div class="author-container">', '</div>')
        author = author.replace('\n', '').lstrip()
        pieces.append(re.sub('<[^<]+?>', '', author) + '\n\n')
    compiled = ''.join(pieces)

    if compiled != '':
        self.plot = compiled
        return

    description = gutils.regextrim(self.page, 'itemprop="description"', '<')
    self.plot = gutils.after(description, '>')
    elements = self.plot_page.split('<p class="plotpar">')
    if len(elements) < 2:
        elements = re.split('<li class="(?:odd|even)">', self.plot_page)
    if len(elements) > 1:
        self.plot = self.plot + '\n\n'
        # The chunk before the first marker is page header, not plot text.
        for element in elements[1:]:
            if element != '':
                self.plot = self.plot + gutils.strip_tags(
                    gutils.before(element, '</a>')) + '\n\n'
def get_searches(self):
    """Collect search results from ``self.page`` into ``self.ids`` / ``self.titles``.

    A page whose ``<title>Suche`` marker is found past position 0 is
    treated as a result list: each ``hit.php3?hit=`` chunk with a
    ``movie-<id>-`` part adds an id and a title. Any other page adds only
    the id from its ``index.php3?id=`` link.
    """
    is_result_list = self.page.find('<title>Suche') > 0
    if not is_result_list:
        single_id = gutils.regextrim(self.page, 'index[.]php3[?]id=', '("|;|\')')
        self.ids.append(single_id)
        return

    chunks = self.page.split("hit.php3?hit=")
    # chunks[0] is the page text ahead of the first hit link.
    for chunk in chunks[1:]:
        if chunk != '':
            found_id = gutils.trim(chunk, 'movie-', '-')
            if found_id != '':
                self.ids.append(found_id)
                label = gutils.regextrim(chunk, '>', '</[Aa]>')
                label = label.replace('<br />', ' - ')
                self.titles.append(gutils.strip_tags(label))
def get_o_title(self):
    """Set ``self.o_title`` from the first source that provides a value.

    Sources, in order: the ``Originaltitel`` entry, the parenthesised part
    of the ``<h1(`` heading and the ``teaser`` div of ``self.page``; the
    ``Originaltitel`` paragraph of ``self.videopage`` when a video page
    exists; and the plain ``<h1>`` heading of ``self.page``.
    """
    title = (
        gutils.regextrim(
            self.page, '(<p>Originaltitel[:] |Originaltitel<[^>]+>)', '(</tr>|</p>)')
        or gutils.trim(self.page, '<h1(', ')')
        or gutils.trim(self.page, '<div class="teaser">', '</')
    )
    if not title and self.videopage:
        title = gutils.trim(self.videopage, '<p>Originaltitel: ', '</p>')
    if not title:
        title = gutils.regextrim(self.page, '<h1>', '(</h1>|</span>)')
    self.o_title = title
def get_o_title(self):
    """Set ``self.o_title`` from ``self.page``.

    Tried in order: the ``Originaltitel`` paragraph, the parenthesised
    text of the ``standardsmall`` span, the ``teaser`` div, and the
    ``<title>`` text up to ``|`` or ``</title>``.
    """
    title = gutils.trim(self.page, '<p>Originaltitel: ', '</p>')
    if not title:
        title = gutils.trim(self.page, '<span class="standardsmall">(', ')')
    if not title:
        title = gutils.trim(self.page, '<div class="teaser">', '</')
    if not title:
        title = gutils.regextrim(self.page, '<title>', '([|]|</title>)')
    self.o_title = title
def get_o_title(self):
    """Set ``self.o_title`` from ``self.page``.

    The ``Originaltitel:`` entry is cleaned and passed through
    ``string.capwords``. If that is empty, the text of the ``film-titel``
    heading is used as it stands.
    """
    labelled = gutils.regextrim(self.page, '<b>Originaltitel:', '(</p>|<b>)')
    self.o_title = string.capwords(gutils.clean(labelled))
    if self.o_title:
        return
    heading = gutils.trim(self.page, 'class=\'film-titel\'', '</h1>')
    self.o_title = gutils.after(heading, '>')
def get_cameraman(self):
    """Extract the cinematography credits into ``self.cameraman``.

    The ``zdjęcia`` row of ``self.creditspage`` is taken, ``<br />``
    becomes ``', '``, the result is cleaned with ``gutils.clean`` and one
    trailing comma, if present, is removed.
    """
    credits_row = gutils.regextrim(self.creditspage, u'zdjęcia: <', '(</tr>|<tr>)')
    credits_text = gutils.after(credits_row, '>')
    credits_text = gutils.clean(credits_text.replace('<br />', ', '))
    self.cameraman = credits_text[:-1] if credits_text.endswith(',') else credits_text
def get_o_title(self):
    """Set ``self.o_title`` from ``self.o_page``.

    Tried in order: the ``title-extra`` span marked "(original title)",
    the ``<h1>`` heading up to its ``<span``, and the ``<title>`` text cut
    at the first ``(``.
    """
    title = gutils.trim(self.o_page, '<span class="title-extra">', '<i>(original title)</i>')
    if title == '':
        title = gutils.regextrim(self.o_page, '<h1>', '([ ]|[&][#][0-9]+[;])<span')
    if title == '':
        page_title = gutils.trim(self.o_page, '<title>', '</title>')
        title = re.sub('[(].*', '', page_title)
    self.o_title = title
def get_title(self):
    """Set ``self.title``, preferring a translated title tagged ``[de]``.

    The ``<h1>`` heading of ``self.page`` is the default. When the
    "Alternativ" / "Auch bekannt als" block holds translated entries, the
    first one that yields text before ``[de]`` (cut at any ``(``) replaces
    it.
    """
    self.title = gutils.trim(self.page, '<h1>', '<span')
    aka_block = gutils.regextrim(self.page, '<h5>(Alternativ|Auch bekannt als):', '</div>')
    variants = aka_block.split('<i class="transl"')
    if len(variants) <= 1:
        return
    for variant in variants:
        candidate = gutils.before(gutils.trim(variant, '>', '[de]'), '(')
        if candidate != '':
            self.title = candidate
            break
def get_runtime(self):
    """Set ``self.runtime`` from the ``Runtime`` section of ``self.page``.

    The tag-stripped text before ``min<`` is stored. When it has the form
    ``<hours>h <minutes>``, it is converted to the total number of
    minutes as an ``int``. Text that does not convert is kept as it is.
    """
    self.runtime = gutils.strip_tags(
        gutils.regextrim(self.page, 'Runtime<[^>]+>', 'min<'))
    parts = self.runtime.split('h ')
    if len(parts) > 1:
        try:
            self.runtime = int(parts[0]) * 60 + int(parts[1])
        except ValueError:
            # Only a failed int() conversion is expected here; the bare
            # except used before also hid unrelated errors and interrupts.
            pass
def get_o_title(self):
    """Set ``self.o_title`` from the ``text_ergebniss_faz_3`` block of ``self.page``.

    The text between the block's link and the ``(Originaltitel)`` marker
    is cleaned; only the part after its last comma is kept (when that
    comma is not the first character) and passed through
    ``string.capwords``. If nothing remains, the link text of the block
    is used.
    """
    block_marker = 'class="text_ergebniss_faz_3"'
    block = gutils.regextrim(self.page, block_marker, '[ \t]+[(]Originaltitel[)]')
    title = gutils.clean(gutils.after(block, '</a>'))
    last_comma = title.rfind(',')
    if last_comma > 0:
        title = title[last_comma + 1:]
    title = string.capwords(title)
    if title == '':
        link = gutils.trim(self.page, block_marker, '</a>')
        title = gutils.after(link, '>')
    self.o_title = title
def get_studio(self):
    """Set ``self.studio`` to a comma-separated list of production companies.

    The names are the link texts found in the "Production Companies"
    section of ``self.comp_page``.
    """
    self.studio = ''
    section = gutils.regextrim(self.comp_page, 'Production Companies<[^>]+', '</ul>')
    names = [gutils.trim(chunk, '>', '<') for chunk in section.split('href="')]
    self.studio = ', '.join(name for name in names if name)
def get_cameraman(self):
    """Set ``self.cameraman`` to a comma-separated list of cinematographers.

    The names are the link texts found in the "Cinematography by" table
    of ``self.cast_page``.
    """
    self.cameraman = ''
    table = gutils.regextrim(self.cast_page, 'Cinematography by<[^>]+', '</table>')
    found = []
    for chunk in table.split('href="'):
        name = gutils.trim(chunk, '>', '<')
        if name:
            found.append(name)
    self.cameraman = ', '.join(found)
def get_studio(self):
    """Set ``self.studio`` to a comma-separated list of production companies.

    The "Production Company" section of ``self.page`` is split at each
    ``/db/companies`` link and the cleaned link texts are joined.
    """
    self.studio = ''
    section = gutils.regextrim(self.page, '>Production Company<', '(<B>|</TABLE>)')
    names = []
    for chunk in re.split('(href|HREF)="/db/companies', section):
        name = gutils.clean(gutils.trim(chunk, '>', '<'))
        if name:
            names.append(name)
    self.studio = ', '.join(names)
def get_plot(self):
    """Set ``self.plot`` from the main page plus the plot-summary page.

    The ``Plot`` section of ``self.page`` (passed through
    ``__before_more``) comes first. Every ``plotpar`` paragraph of
    ``self.plot_page`` is then appended, each followed by a blank line.
    """
    summary = gutils.regextrim(self.page, '<h5>Plot:</h5>', '(</div>|<a href.*)')
    self.plot = self.__before_more(summary)

    # The chunk before the first paragraph marker is not a paragraph.
    paragraphs = self.plot_page.split('<p class="plotpar">')[1:]
    if paragraphs:
        texts = [gutils.strip_tags(gutils.before(paragraph, '</a>')) + '\n\n'
                 for paragraph in paragraphs if paragraph != '']
        self.plot = self.plot + '\n\n' + ''.join(texts)
def get_screenplay(self):
    """Set ``self.screenplay`` from the first source that provides a value.

    Tried in order: the ``Buch:`` table cell of ``self.page``, the
    ``Drehbuch: `` row of ``self.creditspage``, and the names following
    each ``<h3>Drehbuch</h3>`` heading of ``self.page`` joined with
    ``', '``.
    """
    self.screenplay = gutils.regextrim(self.page, '<th>Buch:', '<th>')
    if not self.screenplay:
        self.screenplay = gutils.trim(self.creditspage, 'Drehbuch: ', '</tr>')
    if self.screenplay:
        return
    # Only the chunks that follow a heading carry a name.
    sections = re.split('<h3>Drehbuch</h3>', self.page)[1:]
    if sections:
        names = [gutils.trim(section, 'itemprop="name">', '<') for section in sections]
        self.screenplay = self.screenplay + ', '.join(names)
def get_cameraman(self):
    """Set ``self.cameraman`` from the first source that provides a value.

    Tried in order: the ``Kamera:`` table cell of ``self.page``, the
    ``Kamera `` row of ``self.creditspage``, and the names following each
    ``Kamera`` function-title span of ``self.page`` joined with ``', '``.
    """
    self.cameraman = gutils.regextrim(self.page, '<th>Kamera:', '(<th>|</table>)')
    if not self.cameraman:
        self.cameraman = gutils.trim(self.creditspage, 'Kamera ', '</tr>')
    if self.cameraman:
        return
    # The chunk before the first span does not belong to a camera credit.
    credit_chunks = re.split('"function-title">Kamera</span>', self.page)[1:]
    if credit_chunks:
        names = [gutils.trim(chunk, 'itemprop="name">', '<') for chunk in credit_chunks]
        self.cameraman = self.cameraman + ', '.join(names)
def get_cameraman(self):
    """Set ``self.cameraman`` to a comma-separated list of names.

    The names are the link texts found in the ``Image`` table of
    ``self.cast_page``.
    """
    # OK v0.1
    self.cameraman = ''
    table = gutils.regextrim(self.cast_page, 'Image<[^>]+', '</table>')
    link_texts = (gutils.trim(chunk, '>', '<') for chunk in table.split('href="'))
    self.cameraman = ', '.join([text for text in link_texts if text])
def get_cameraman(self):
    """Set ``self.cameraman`` to a comma-separated list of cinematographers.

    The names are the link texts in the "Cinematography by" table of
    ``self.cast_page``, with line breaks removed and surrounding
    whitespace stripped.
    """
    self.cameraman = ''
    table = gutils.regextrim(self.cast_page, '>Cinematography by', '</table>')
    names = []
    # The chunk before the first link holds no name.
    for chunk in table.split('href="')[1:]:
        name = gutils.trim(chunk, '>', '<').replace('\n', '').strip()
        if name:
            names.append(name)
    self.cameraman = ', '.join(names)
def get_o_title(self):
    """Set ``self.o_title`` from ``self.page``, cleaned and without double quotes.

    Tried in order: the ``originalTitle`` div, the ``<h1 itemprop="name">``
    heading, the ``og:title`` meta content, and the ``<title>`` text cut
    at the first `` (``.
    """
    # Films coming from the German branch can carry their German title in
    # the h1 name tag; in that case (only?) IMDB renders an additional
    # "originalTitle" element, which is therefore checked first.
    title = gutils.trim(self.page, '<div class="originalTitle">', '<span')
    if not title:
        # NOTE(review): the end pattern is reproduced as found; it may
        # originally have been a no-break space or '&nbsp;' - confirm.
        title = gutils.regextrim(self.page, '<h1 itemprop="name"[^>]*>', ' ')
    if not title:
        title = gutils.trim(self.page, 'og:title\' content="', '"')
    if not title:
        page_title = gutils.trim(self.page, '<title>', '</title>')
        title = re.sub(' [(].*', '', page_title)
    self.o_title = gutils.clean(re.sub('"', '', title))
def get_o_title(self):
    """Set ``self.o_title`` from the first source that provides a value.

    Sources, in order: the ``Originaltitel`` paragraph, the parenthesised
    part of the ``<h1(`` heading and the ``teaser`` div of ``self.page``;
    the ``Originaltitel`` paragraph of ``self.videopage`` when a video
    page exists; and the ``<h1>`` heading of ``self.page``.
    """
    title = gutils.trim(self.page, '<p>Originaltitel: ', '</p>')
    if not title:
        title = gutils.trim(self.page, '<h1(', ')')
    if not title:
        title = gutils.trim(self.page, '<div class="teaser">', '</')
    if not title and self.videopage:
        title = gutils.trim(self.videopage, '<p>Originaltitel: ', '</p>')
    if not title:
        title = gutils.regextrim(self.page, '<h1>', '</h1>')
    self.o_title = title
def get_cameraman(self):
    """Collect the cinematographers of ``self.cast_page`` into ``self.cameraman``.

    Each link text of the "Cinematography by" table is stripped of line
    breaks and surrounding whitespace; non-empty names are joined with
    ``", "``.
    """
    self.cameraman = ""
    table = gutils.regextrim(self.cast_page, ">Cinematography by", "</table>")
    link_chunks = table.split('href="')[1:]  # text before the first link is skipped
    cleaned = [gutils.trim(chunk, ">", "<").replace("\n", "").strip()
               for chunk in link_chunks]
    self.cameraman = ", ".join(name for name in cleaned if name)
def get_studio(self):
    """Collect the production companies of ``self.comp_page`` into ``self.studio``.

    Each link text of the ``name="production"`` section is stripped of
    line breaks and surrounding whitespace; non-empty names are joined
    with ``", "``.
    """
    self.studio = ""
    section = gutils.regextrim(self.comp_page, 'name="production"', "</ul>")
    link_chunks = section.split('href="')[1:]  # text before the first link is skipped
    cleaned = [gutils.trim(chunk, ">", "<").replace("\n", "").strip()
               for chunk in link_chunks]
    self.studio = ", ".join(name for name in cleaned if name)
def get_cast(self):
    """Set ``self.cast`` from the "Attori" section of ``self.page``.

    Link and table-row markup is turned into line breaks, ``<a>`` into the
    translated `` as `` and the "vedi il resto del cast" link text is
    removed. The text is then cleaned, runs of spaces are collapsed and
    spaces at the start of a line are dropped.
    """
    # Find the actors, one entry per line.
    cast = gutils.regextrim(self.page, ">Attori</font>", '(<font class="fontViolaB">|\n)')
    substitutions = (
        ("target='_self'>", "\n>"),
        ("<a>", _(" as ").encode('utf8')),
        ("</tr><tr>", '\n'),
        ("...vedi il resto del cast", ''),
    )
    for old, new in substitutions:
        cast = cast.replace(old, new)
    cast = gutils.clean(cast)
    # NOTE(review): the search string below is reproduced as found; it may
    # originally have been a no-break space or '&nbsp;' - confirm.
    cast = cast.replace(" ", ' ')
    cast = re.sub('[ ]+', ' ', cast)
    self.cast = re.sub('\n[ ]+', '\n', cast)
def get_o_title(self):
    """Set ``self.o_title`` from ``self.page``.

    Tried in order: the first double-quoted text of the ``Alias`` info
    block, the ``<h1>`` heading up to its ``<span``, and the ``<title>``
    text cut at the first `` (``.
    """
    # OK v0.1
    alias_block = gutils.trim(self.page, '<h5>Alias:</h5><div class="info-content">', '</div>')
    title = gutils.trim(alias_block, '"', '"')
    if not title:
        # same conditions as title
        title = gutils.regextrim(self.page, '<h1>', '([ ]|[&][#][0-9]+[;])<span')
    if not title:
        page_title = gutils.trim(self.page, '<title>', '</title>')
        title = re.sub(' [(].*', '', page_title)
    self.o_title = title
def get_studio(self):
    """Set ``self.studio`` to a comma-separated list of production companies.

    The names are the link texts in the ``name="production"`` section of
    ``self.comp_page``, with line breaks removed and surrounding
    whitespace stripped.
    """
    self.studio = ''
    section = gutils.regextrim(self.comp_page, 'name="production"', '</ul>')
    companies = []
    # The chunk before the first link holds no company name.
    for chunk in section.split('href="')[1:]:
        company = gutils.trim(chunk, '>', '<').replace('\n', '').strip()
        if company:
            companies.append(company)
    self.studio = ', '.join(companies)
def get_plot(self):
    """Build ``self.plot`` from the main page and the plot-summary page.

    The text starts with the ``Plot`` section of ``self.page`` (passed
    through ``__before_more``). When ``self.plot_page`` has ``plotpar``
    paragraphs, a blank line and then each paragraph's tag-stripped text
    are appended.
    """
    self.plot = gutils.regextrim(self.page, '<h5>Plot:</h5>', '(</div>|<a href.*)')
    self.plot = self.__before_more(self.plot)

    chunks = self.plot_page.split('<p class="plotpar">')
    if len(chunks) <= 1:
        return
    appended = ['\n\n']
    # chunks[0] is the page text ahead of the first paragraph.
    for chunk in chunks[1:]:
        if chunk != '':
            appended.append(
                gutils.strip_tags(gutils.before(chunk, '</a>')) + '\n\n')
    self.plot = self.plot + ''.join(appended)