def iter_subtitles(self, language):
    """Yield a ``Subtitle`` for each subtitle link found in the document.

    For every ``a.subtitle_page_link`` anchor, the id is the part of the
    ``href`` after the last ``-p`` and the name is the anchor's text content.
    The CD count is read from the fifth ``td`` found under the anchor's
    great-grandparent element.  The description is left as ``NotLoaded``.

    :param language: stored unchanged in each subtitle's ``language`` field.
    """
    root = self.document.getroot()
    for anchor in self.parser.select(root, "a.subtitle_page_link"):
        href = anchor.attrib.get("href", "")
        sub_id = unicode(href.split("-p")[-1])
        title = unicode(anchor.text_content())

        # Climb three levels from the link; the fifth cell there holds the
        # number of CDs.
        row = anchor.getparent().getparent().getparent()
        cd_cell = self.parser.select(row, "td")[4]
        cd_count = int(cd_cell.text)

        subtitle = Subtitle(sub_id, title)
        subtitle.nb_cd = cd_count
        subtitle.language = language
        subtitle.description = NotLoaded
        yield subtitle
def iter_subtitles(self, language):
    """Generate ``Subtitle`` objects from the subtitle links of the page.

    Each ``a.subtitle_page_link`` element contributes one subtitle:

    * id: the text following the last ``-p`` in the link's ``href``;
    * name: the link's text content;
    * nb_cd: integer value of the fifth ``td`` under the link's
      great-grandparent;
    * language: the ``language`` argument, untouched;
    * description: ``NotLoaded``.
    """
    select = self.parser.select
    for link in select(self.document.getroot(), 'a.subtitle_page_link'):
        target = link.attrib.get('href', '')
        identifier = unicode(target.split('-p')[-1])
        label = unicode(link.text_content())

        # parent -> grandparent -> great-grandparent of the link
        container = link.getparent()
        container = container.getparent()
        container = container.getparent()
        nb_cd = int(select(container, 'td')[4].text)

        result = Subtitle(identifier, label)
        result.nb_cd = nb_cd
        result.language = language
        result.description = NotLoaded
        yield result
def iter_subtitles(self):
    """Yield one ``Subtitle`` per episode row of the ``search_results`` table.

    Rows are walked in document order, skipping rows whose ``class`` contains
    ``head``.  A row for which the ``class`` attribute lookup is falsy updates
    the current season label from its first cell; any other row is an episode
    and produces a subtitle named
    ``"<series> - <season> - Episode <episode>"``.

    The subtitle id is the number following ``/imdbid-`` at the end of the
    link found in the row's third cell.

    :raises AttributeError: if an episode link does not end in
        ``/imdbid-<digits>`` (the regex match is ``None``).
    """
    series_name = CleanText('//div[has-class("msg")]//h1//span[@itemprop="name"]')(self.doc)
    # Recovers the sub id from the url.  Raw string so that the backslash
    # reaches the regex engine instead of being an invalid string escape.
    episode_id_re = re.compile(r'.*/imdbid-(?P<episode_id>\d+)$')
    rows = self.doc.xpath('//table[@id="search_results"]//tbody//tr[not(contains(@class,"head"))]')

    season = ''
    for row in rows:
        if not Attr('.', 'class', default=None)(row):
            # Row without a usable class attribute: remember its first cell
            # as the season for the episode rows that follow.
            season = CleanText('.//td[1]')(row)
            continue

        subtitle = Subtitle()
        episode = CleanText('.//td[1]')(row)
        subtitle.name = '%s - %s - Episode %s' % (series_name, season, episode)
        url = Link('.//td[3]//a')(row)
        subtitle.url = self.browser.absurl(url)
        # The id is extracted from the link as found in the page, not from
        # the absolute URL.
        subtitle.id = episode_id_re.match(url).group('episode_id')
        yield subtitle
def get_subtitle(self, id):
    """Build the ``Subtitle`` whose download link matches ``id``.

    :param id: string of the form ``<something>|<href>``; the part after the
        first ``|`` is the relative download link searched for in the page.
    :return: a ``Subtitle`` named ``"<original title> (<translated title>)"``
        (both lower-cased, whitespace-normalised), with language ``fr``, the
        CD count taken from the third column and no description.
    """
    href = id.split('|')[1]

    # Locate the link pointing at this address, then climb five levels to
    # reach the element holding the row's cells.
    row = self.parser.select(self.document.getroot(), 'a[href="%s"]' % href, 1)
    for _ in range(5):
        row = row.getparent()
    cells = self.parser.select(row, 'td')

    translated = self.parser.select(cells[0], 'font', 1).text.lower()
    original = self.parser.select(cells[1], 'font', 1).text.lower()
    cd_text = self.parser.select(cells[2], 'font', 1).text.strip()
    cd_count = int(cd_text.split()[0])

    # Splitting and re-joining gets rid of special spacing characters.
    translated = " ".join(translated.split())
    original = " ".join(original.split())

    full_name = unicode('%s (%s)' % (original, translated))
    download_url = unicode('http://davidbillemont3.free.fr/%s' % href)

    subtitle = Subtitle(id, full_name)
    subtitle.url = download_url
    subtitle.ext = download_url.split('.')[-1]
    subtitle.language = unicode('fr')
    subtitle.nb_cd = cd_count
    subtitle.description = NotAvailable
    return subtitle
def get_subtitle(self):
    """Build the ``Subtitle`` described by the current subtitle page.

    * id: last path component of ``self.url`` with ``.html`` and
      ``subtitle-`` removed;
    * url: ``<BASEURL>/download-<id>.html``;
    * language: the part of that url after its last ``-`` and before
      ``.html``;
    * nb_cd: integer text of the third cell of the single row whose
      ``title`` contains ``amount``;
    * description: ``"files :"`` followed by the text of the third cell of
      the single row whose ``title`` contains ``list``, with a newline
      inserted after every ``.srt``;
    * name / ext: the text of the third cell of the ``filename`` row, split
      on its last ``.<word>`` suffix; ``ext`` is ``'zip'`` when there is no
      such suffix.

    :raises ValueError: if the page does not contain exactly one ``amount``
        row or exactly one ``list`` row (tuple unpacking).
    """
    filename_row = self.doc.xpath('//img[@alt="filename"]')[0].getparent().getparent()
    name = to_unicode(filename_row.xpath('.//td')[2].text)

    sub_id = self.url.split('/')[-1].replace('.html', '').replace('subtitle-', '')
    url = '%s/download-%s.html' % (self.browser.BASEURL, sub_id)

    # Single-element unpacking: fails loudly unless exactly one row matches.
    (amount_row,) = self.doc.xpath('//tr[contains(@title, "amount")]')
    nb_cd = int(amount_row.xpath('.//td')[2].text)
    lang = url.split('-')[-1].split('.html')[0]

    (files_row,) = self.doc.xpath('//tr[contains(@title,"list")]')
    file_names = files_row.xpath('.//td')[2].text_content().strip().replace('.srt', '.srt\n')
    desc = u"files :\n" + file_names

    # Raw string: the backslashes are regex escapes, not string escapes.
    m = re.match(r'(.*?)\.(\w+)$', name)
    if m:
        name, ext = m.group(1), m.group(2)
    else:
        ext = 'zip'

    subtitle = Subtitle(sub_id, name)
    subtitle.url = url
    subtitle.ext = ext
    subtitle.language = lang
    subtitle.nb_cd = nb_cd
    subtitle.description = desc
    return subtitle
def get_subtitle(self):
    """Build the ``Subtitle`` described by the current subtitle page.

    * id: last path component of the browser's current URL with ``.html``
      and ``subtitle-`` removed;
    * url: ``http://<DOMAIN>/download-<id>.html``;
    * language: the part of that url after its last ``-`` and before
      ``.html``;
    * nb_cd: integer text of the third cell of the ``tr[title~=amount]`` row;
    * description: ``"files :"`` followed by the text of the third cell of
      the ``tr[title~=list]`` row, with a newline inserted after every
      ``.srt``;
    * name / ext: the text of the third cell of the ``filename`` row, split
      on its last ``.<word>`` suffix; ``ext`` is ``'zip'`` when there is no
      such suffix.
    """
    root = self.document.getroot()

    filename_row = self.parser.select(root, 'img[alt=filename]', 1).getparent().getparent()
    name = unicode(self.parser.select(filename_row, 'td')[2].text)

    page_name = self.browser.geturl().split('/')[-1]
    sub_id = page_name.replace('.html', '').replace('subtitle-', '')
    url = unicode('http://%s/download-%s.html' % (self.browser.DOMAIN, sub_id))

    amount_row = self.parser.select(root, 'tr[title~=amount]', 1)
    nb_cd = int(self.parser.select(amount_row, 'td')[2].text)
    lang = unicode(url.split('-')[-1].split('.html')[0])

    files_row = self.parser.select(root, 'tr[title~=list]', 1)
    files_cell = self.parser.select(files_row, 'td')[2]
    desc = u"files :\n" + files_cell.text_content().strip().replace('.srt', '.srt\n')

    # Raw string: the backslashes are regex escapes, not string escapes.
    m = re.match(r'(.*?)\.(\w+)$', name)
    if m:
        name, ext = m.group(1), m.group(2)
    else:
        ext = 'zip'

    subtitle = Subtitle(sub_id, name)
    subtitle.url = url
    subtitle.ext = ext
    subtitle.language = lang
    subtitle.nb_cd = nb_cd
    subtitle.description = desc
    return subtitle
def iter_subtitles(self):
    """Iterate over the episode subtitles listed in the search results.

    Every ``tr`` of the ``search_results`` table whose ``class`` does not
    contain ``head`` is visited in order.  When the row's ``class`` attribute
    lookup is falsy, the row's first cell becomes the current season label.
    Otherwise the row yields a ``Subtitle`` with:

    * name: ``"<series name> - <season> - Episode <first cell>"``;
    * url: the absolute form of the link in the third cell;
    * id: the digits after ``/imdbid-`` at the end of that link.

    :raises AttributeError: when a link does not end in ``/imdbid-<digits>``,
        because the regex match is then ``None``.
    """
    season = ''
    series_name = CleanText(
        '//div[has-class("msg")]//h1//span[@itemprop="name"]')(self.doc)

    # A regexp to recover the sub id from url.  Written as a raw string so
    # the backslash is not treated as an (invalid) string escape.
    id_pattern = re.compile(r'.*/imdbid-(?P<episode_id>\d+)$')

    for tr in self.doc.xpath(
            '//table[@id="search_results"]//tbody//tr[not(contains(@class,"head"))]'):
        has_class = Attr('.', 'class', default=None)(tr)
        if has_class:
            subtitle = Subtitle()
            episode = CleanText('.//td[1]')(tr)
            subtitle.name = '%s - %s - Episode %s' % (series_name, season, episode)
            link = Link('.//td[3]//a')(tr)
            subtitle.url = self.browser.absurl(link)
            found = id_pattern.match(link)
            subtitle.id = found.groupdict()['episode_id']
            yield subtitle
        else:
            # Not an episode row: its first cell names the season of the
            # rows that follow.
            season = CleanText('.//td[1]')(tr)
def get_subtitle(self):
    """Build the ``Subtitle`` described by the current subtitle page.

    * id: last ``/``-separated component of the ``rel`` attribute of the
      ``a#bt-dwl`` download link;
    * ext: the word in parentheses when the link text starts with
      ``Download (<ext>)``, otherwise ``zip``;
    * url: ``http://www.opensubtitles.org/subtitleserve/sub/<id>``;
    * nb_cd: the integer preceding the first ``cd`` in the lower-cased title
      of the ``link[rel=bookmark]`` element;
    * language: the first parenthesised part of that title, replaced by the
      matching key of ``LANGUAGE_CONV`` when it equals one of its values;
    * description: ``"files :"`` plus one whitespace-normalised line per
      ``img[title~=filename]`` parent, or ``NotAvailable`` if there is none;
    * name: ``"<title> (<first file name>)"``, or just the title when the
      page lists no file.  (Previously the no-file case raised
      ``UnboundLocalError`` because ``file_name`` was never assigned.)
    """
    root = self.document.getroot()

    a = self.parser.select(root, 'a#bt-dwl', 1)
    sub_id = a.attrib.get('rel', '').split('/')[-1]

    # Raw string: the backslashes are regex escapes, not string escapes.
    m = re.match(r'Download \((\w+)\)', self.parser.tocleanstring(a))
    ext = m.group(1) if m else u'zip'
    url = unicode('http://www.opensubtitles.org/subtitleserve/sub/%s' % sub_id)

    link = self.parser.select(root, 'link[rel=bookmark]', 1)
    title = unicode(link.attrib.get('title', ''))
    nb_cd = int(title.lower().split('cd')[0].split()[-1])
    lang = unicode(title.split('(')[1].split(')')[0])

    file_imgs = self.parser.select(root, "img[title~=filename]")
    if len(file_imgs) > 0:
        # One whitespace-normalised line per listed file.
        file_lines = [' '.join(img.getparent().text_content().split())
                      for img in file_imgs]
        desc = u'files :'
        for file_line in file_lines:
            desc += '\n' + file_line
        name = unicode('%s (%s)' % (title, file_lines[0]))
    else:
        # No file listed on the page: keep the defaults instead of
        # referencing an unbound ``file_name``.
        desc = NotAvailable
        name = title

    # Map the long language name found in the title to its short code.
    for lshort, llong in LANGUAGE_CONV.items():
        if lang == llong:
            lang = unicode(lshort)
            break

    subtitle = Subtitle(sub_id, name)
    subtitle.url = url
    subtitle.ext = ext
    subtitle.language = lang
    subtitle.nb_cd = nb_cd
    subtitle.description = desc
    return subtitle
def get_subtitle(self):
    """Return the ``Subtitle`` for the subtitle page currently loaded.

    The fields are derived as follows:

    * name and ext come from the third cell of the row containing the
      ``img[alt=filename]`` image: the text is split at its final
      ``.<word>`` suffix, and ``ext`` defaults to ``'zip'`` when no suffix
      is present;
    * id is the last path component of the browser URL without ``.html``
      and ``subtitle-``;
    * url is ``http://<DOMAIN>/download-<id>.html`` and language is what
      lies between the last ``-`` of that url and ``.html``;
    * nb_cd is the integer in the third cell of the ``tr[title~=amount]``
      row;
    * description lists the files from the third cell of the
      ``tr[title~=list]`` row, one per line (a newline is inserted after
      each ``.srt``).
    """
    select = self.parser.select
    root = self.document.getroot()

    filename_line = select(root, 'img[alt=filename]', 1).getparent().getparent()
    name = unicode(select(filename_line, 'td')[2].text)

    id = self.browser.geturl().split('/')[-1].replace('.html', '').replace('subtitle-', '')
    url = unicode('http://%s/download-%s.html' % (self.browser.DOMAIN, id))

    amount_line = select(root, 'tr[title~=amount]', 1)
    nb_cd = int(select(amount_line, 'td')[2].text)
    lang = unicode(url.split('-')[-1].split('.html')[0])

    filenames_line = select(root, 'tr[title~=list]', 1)
    file_names = select(filenames_line, 'td')[2].text_content().strip().replace('.srt', '.srt\n')
    desc = u"files :\n"
    desc += file_names

    # Raw string so that the regex escapes are not parsed as (invalid)
    # string escapes.
    m = re.match(r'(.*?)\.(\w+)$', name)
    if m:
        name = m.group(1)
        ext = m.group(2)
    else:
        ext = 'zip'

    subtitle = Subtitle(id, name)
    subtitle.url = url
    subtitle.ext = ext
    subtitle.language = lang
    subtitle.nb_cd = nb_cd
    subtitle.description = desc
    return subtitle
def get_subtitle(self, id):
    """Build the ``Subtitle`` for ``id`` from the current detail page.

    * language: for each link inside ``fieldset.information`` whose ``href``
      contains ``/fr/ppodnapisi/kategorija/jezik/``, the last path component
      is looked up among the values of ``LANGUAGE_NUMBERS``; the matching key
      becomes the language.
    * description: the whitespace-normalised text of every ``p`` inside the
      ``fieldset.information`` elements, one per line.
    * nb_cd: for a ``span`` whose text contains ``CD``, the integer text of
      the second ``span`` under the same parent.
    * name: the page ``<title>`` up to the first ``" - "``.
    * url: ``http://www.podnapisi.net`` plus the ``href`` of a
      ``div.footer > a.download`` link containing ``id``.

    Language, url and CD count stay ``NotAvailable`` when not found.

    :param id: subtitle identifier, also used to pick the download link.
    """
    root = self.document.getroot()

    language = NotAvailable
    for anchor in self.parser.select(root, "fieldset.information a"):
        target = anchor.attrib.get("href", "")
        if "/fr/ppodnapisi/kategorija/jezik/" not in target:
            continue
        lang_number = target.split("/")[-1]
        for code, number in LANGUAGE_NUMBERS.items():
            if str(number) == str(lang_number):
                language = unicode(code)
                break

    desc = u""
    nb_cd = NotAvailable
    for fieldset in self.parser.select(root, "fieldset.information"):
        for paragraph in self.parser.select(fieldset, "p"):
            desc += "%s\n" % (u" ".join(paragraph.text_content().strip().split()))
        for span in self.parser.select(fieldset, "span"):
            if span.text is None or "CD" not in span.text:
                continue
            # The value sits in the second span next to the "CD" label.
            nb_cd = int(self.parser.select(span.getparent(), "span")[1].text)

    title = unicode(self.parser.select(root, "head title", 1).text)
    name = title.split(" - ")[0]

    url = NotAvailable
    for anchor in self.parser.select(root, "div.footer > a.download"):
        target = anchor.attrib.get("href", "")
        if id in target:
            url = u"http://www.podnapisi.net%s" % target

    subtitle = Subtitle(id, name)
    subtitle.url = url
    subtitle.language = language
    subtitle.nb_cd = nb_cd
    subtitle.description = desc
    return subtitle
def get_subtitle(self, id):
    """Assemble a ``Subtitle`` from the information block of the page.

    The language is resolved from the language-category link of
    ``fieldset.information`` through ``LANGUAGE_NUMBERS``, the description is
    made of the fieldset paragraphs (one normalised line each), the CD count
    is the second ``span`` beside a ``span`` mentioning ``CD``, the name is
    the page title before its first ``" - "``, and the url is built from a
    footer download link whose ``href`` contains ``id``.  Anything not found
    is left as ``NotAvailable`` (the description defaults to an empty
    string).

    :param id: identifier given to the resulting subtitle.
    """
    select = self.parser.select
    root = self.document.getroot()

    language = NotAvailable
    url = NotAvailable
    nb_cd = NotAvailable

    for info_link in select(root, 'fieldset.information a'):
        href = info_link.attrib.get('href', '')
        if '/fr/ppodnapisi/kategorija/jezik/' in href:
            nlang = href.split('/')[-1]
            # First key whose number matches the one found in the link.
            matching = [lang for lang, langnum in LANGUAGE_NUMBERS.items()
                        if str(langnum) == str(nlang)]
            if matching:
                language = unicode(matching[0])

    desc_lines = []
    for info in select(root, 'fieldset.information'):
        for p in select(info, 'p'):
            desc_lines.append('%s\n' % (u' '.join(p.text_content().strip().split())))
        for span in select(info, 'span'):
            if span.text is not None and 'CD' in span.text:
                sibling_spans = select(span.getparent(), 'span')
                nb_cd = int(sibling_spans[1].text)
    desc = u''.join(desc_lines)

    title = unicode(select(root, 'head title', 1).text)
    name = title.split(' - ')[0]

    for dl_link in select(root, 'div.footer > a.download'):
        href = dl_link.attrib.get('href', '')
        if id in href:
            url = u'http://www.podnapisi.net%s' % href

    subtitle = Subtitle(id, name)
    subtitle.url = url
    subtitle.language = language
    subtitle.nb_cd = nb_cd
    subtitle.description = desc
    return subtitle
def get_subtitle_from_line(self, line):
    """Build a ``Subtitle`` from one row of a results table.

    * name: whitespace-normalised text of the row's first link, followed in
      parentheses by a long name.  The long name is the ``title`` of the
      first ``span`` of the first cell when there is one; otherwise it is the
      second text node of that cell, replaced by ``---`` when that text
      contains ``Download at 25`` or when the cell has fewer than two text
      nodes.
    * language: last ``-``-separated part of the last path component of the
      link in the second cell, replaced by the matching key of
      ``LANGUAGE_CONV`` when it equals one of its values.
    * nb_cd: integer left in the third cell once ``cd`` is removed.
    * url / id: built from the ``href`` of the link in the fifth cell.

    :param line: row element to read.
    :return: the ``Subtitle`` (description ``NotLoaded``), or ``None`` when
        the row has no ``td`` cell.
    """
    cells = self.parser.select(line, 'td')
    if len(cells) == 0:
        return None

    a = self.parser.select(line, 'a')[0]
    name = u" ".join(a.text.strip().split())

    first_cell = cells[0]
    spanlist = self.parser.select(first_cell, 'span')
    if len(spanlist) > 0:
        long_name = spanlist[0].attrib.get('title', '')
    else:
        texts = first_cell.itertext()
        # Skip the first text node and keep the second.  The builtin next()
        # with a default replaces the Python 2-only ``.next()`` method and
        # keeps a StopIteration from escaping into a calling generator.
        next(texts, None)
        long_name = next(texts, "---")
        if "Download at 25" in long_name:
            long_name = "---"
    name = "%s (%s)" % (name, long_name)

    link = self.parser.select(cells[1], 'a', 1)
    lang = link.attrib.get('href', '').split('/')[-1].split('-')[-1]
    for lshort, llong in LANGUAGE_CONV.items():
        if lang == llong:
            lang = unicode(lshort)
            break

    nb_cd = int(cells[2].text.strip().lower().replace('cd', ''))

    href = self.parser.select(cells[4], 'a', 1).attrib.get('href', '')
    url = unicode('http://www.opensubtitles.org%s' % href)
    sub_id = href.split('/')[-1]

    subtitle = Subtitle(sub_id, name)
    subtitle.url = url
    subtitle.language = lang
    subtitle.nb_cd = nb_cd
    subtitle.description = NotLoaded
    return subtitle
def iter_subtitles(self, language, pattern):
    """Yield the subtitles of the results table whose title matches ``pattern``.

    The table is the first ``table`` with ``bordercolor="#B8C0B2"`` or,
    failing that, ``bordercolordark="#B8C0B2"``.  Nothing is yielded when no
    such table exists or when its ``width`` is not ``100%``.

    A row matches when the pattern is a single word present in the
    translated or original title, or when the pattern shares at least two
    words with either title.

    :param language: not used by this implementation; every subtitle is
        tagged ``fr``.
    :param pattern: search string; ``+`` is treated as a space and the
        comparison is case-insensitive.
    """
    pattern = pattern.strip().replace('+', ' ').lower()
    pattern_words = pattern.split()
    wanted = set(pattern_words)
    root = self.document.getroot()

    tables = self.parser.select(root, 'table[bordercolor="#B8C0B2"]')
    if len(tables) == 0:
        tables = self.parser.select(root, 'table[bordercolordark="#B8C0B2"]')
    if len(tables) == 0:
        return
    table = tables[0]
    # some results of freefind point on useless pages
    if table.attrib.get('width', '') != '100%':
        return

    for row in table.getiterator('tr'):
        cells = self.parser.select(row, 'td')
        translated = self.parser.select(cells[0], 'font', 1).text.lower()
        original = self.parser.select(cells[1], 'font', 1).text.lower()
        translated_words = translated.split()
        original_words = original.split()

        # Either a one-word pattern found in a title, or an overlap of at
        # least two words between the pattern and a title.
        single_word_hit = len(pattern_words) == 1 and (
            pattern in translated_words or pattern in original_words)
        multi_word_hit = (len(wanted & set(translated_words)) > 1 or
                          len(wanted & set(original_words)) > 1)
        if not (single_word_hit or multi_word_hit):
            continue

        # this is to trash special spacing chars
        translated = " ".join(translated_words)
        original = " ".join(original_words)

        cd_text = self.parser.select(cells[2], 'font', 1).text.strip()
        cd_count = int(cd_text.strip(' CD'))
        full_name = unicode('%s (%s)' % (original, translated))
        href = self.parser.select(cells[3], 'a', 1).attrib.get('href', '')
        download_url = unicode('http://davidbillemont3.free.fr/%s' % href)
        sub_id = unicode('%s|%s' % (self.browser.geturl().split('/')[-1], href))

        subtitle = Subtitle(sub_id, full_name)
        subtitle.url = download_url
        subtitle.ext = download_url.split('.')[-1]
        subtitle.language = unicode('fr')
        subtitle.nb_cd = cd_count
        subtitle.description = NotAvailable
        yield subtitle
def get_subtitle(self, id=None):
    """Build the ``Subtitle`` described by the current page.

    The description comes from the ``span[@itemprop="description"]`` inside a
    ``fieldset``, the name from the ``div//div//h2`` heading and the url is
    the page url itself.

    :param id: identifier to assign; when falsy, the id is the number that
        follows ``/en/subtitles/`` in ``self.url``.
    :raises AttributeError: when ``id`` is falsy and ``self.url`` does not
        match the expected subtitle URL (the regex match is ``None``).
    """
    subtitle = Subtitle()
    subtitle.description = CleanText(
        './/fieldset/span[@itemprop="description"]')(self.doc)

    if id:
        subtitle.id = id
    else:
        # Raw string: the backslash is a regex escape, not a string escape.
        regexp = re.compile(
            r'https://www.opensubtitles.org/en/subtitles/(?P<id>\d+)/.*$')
        result = regexp.match(self.url)
        subtitle.id = result.group('id')

    subtitle.name = CleanText('.//div//div//h2')(self.doc)
    subtitle.url = self.url
    return subtitle
def get_subtitle(self, id=None):
    """Return a ``Subtitle`` filled from the page currently loaded.

    * description: text of ``.//fieldset/span[@itemprop="description"]``;
    * id: the ``id`` argument when truthy, otherwise the digits captured
      from ``self.url`` right after ``/en/subtitles/``;
    * name: text of ``.//div//div//h2``;
    * url: ``self.url``.

    :raises AttributeError: if no ``id`` is given and ``self.url`` is not an
        ``https://www.opensubtitles.org/en/subtitles/<digits>/...`` address.
    """
    subtitle = Subtitle()
    subtitle.description = CleanText('.//fieldset/span[@itemprop="description"]')(self.doc)

    if not id:
        # Fall back to the numeric id embedded in the page address.  The
        # pattern is a raw string so the regex escape is not parsed as an
        # (invalid) string escape.
        match = re.match(r'https://www.opensubtitles.org/en/subtitles/(?P<id>\d+)/.*$', self.url)
        id = match.groupdict()['id']
    subtitle.id = id

    subtitle.name = CleanText('.//div//div//h2')(self.doc)
    subtitle.url = self.url
    return subtitle
def get_subtitle(self, id):
    """Read the detail page and return the corresponding ``Subtitle``.

    Steps, in order:

    1. scan the links of ``fieldset.information`` for a language-category
       address and translate its trailing number through
       ``LANGUAGE_NUMBERS``;
    2. collect the fieldset paragraphs into the description and read the CD
       count from the second ``span`` next to a ``span`` containing ``CD``;
    3. take the name from the page title (text before the first ``" - "``);
    4. build the download url from a ``div.footer > a.download`` link whose
       ``href`` contains ``id``.

    Language, url and CD count remain ``NotAvailable`` when the page does
    not provide them.

    :param id: identifier of the subtitle.
    """
    document_root = self.document.getroot()
    found_language = NotAvailable
    download_url = NotAvailable
    cd_count = NotAvailable

    # 1. language
    for a in self.parser.select(document_root, 'fieldset.information a'):
        address = a.attrib.get('href', '')
        if '/fr/ppodnapisi/kategorija/jezik/' in address:
            wanted_number = address.split('/')[-1]
            for lang_code, lang_number in LANGUAGE_NUMBERS.items():
                if str(lang_number) != str(wanted_number):
                    continue
                found_language = unicode(lang_code)
                break

    # 2. description and number of CDs
    description = u''
    for fieldset in self.parser.select(document_root, 'fieldset.information'):
        for paragraph in self.parser.select(fieldset, 'p'):
            words = paragraph.text_content().strip().split()
            description += '%s\n' % (u' '.join(words))
        for label in self.parser.select(fieldset, 'span'):
            if label.text is not None and 'CD' in label.text:
                value_span = self.parser.select(label.getparent(), 'span')[1]
                cd_count = int(value_span.text)

    # 3. name
    page_title = unicode(
        self.parser.select(document_root, 'head title', 1).text)
    name = page_title.split(' - ')[0]

    # 4. download address
    for a in self.parser.select(document_root, 'div.footer > a.download'):
        address = a.attrib.get('href', '')
        if id in address:
            download_url = u'http://www.podnapisi.net%s' % address

    subtitle = Subtitle(id, name)
    subtitle.url = download_url
    subtitle.language = found_language
    subtitle.nb_cd = cd_count
    subtitle.description = description
    return subtitle
def get_subtitle(self):
    """Build the ``Subtitle`` described by the current subtitle page.

    * id: ``data-product-id`` attribute of the ``a#bt-dwl-bt`` download
      link;
    * ext: the word in parentheses when the link text starts with
      ``Download (<ext>)``, otherwise ``zip``;
    * url: ``http://www.opensubtitles.org/en/subtitleserve/sub/<id>``;
    * nb_cd: the integer preceding the first ``cd`` in the lower-cased title
      of the ``link[rel=bookmark]`` element;
    * language: the first parenthesised part of that title, replaced by the
      matching key of ``LANGUAGE_CONV`` when it equals one of its values;
    * description: ``"files :"`` plus one whitespace-normalised line per
      ``img[title~=filename]`` parent, or ``NotAvailable`` if there is none;
    * name: ``"<title> (<first file name>)"``, or just the title when the
      page lists no file.  (Previously the no-file case raised
      ``UnboundLocalError`` because ``file_name`` was never assigned.)
    """
    root = self.document.getroot()
    desc = NotAvailable

    a = self.parser.select(root, 'a#bt-dwl-bt', 1)
    sub_id = a.attrib.get('data-product-id', '')

    # Raw string: the backslashes are regex escapes, not string escapes.
    m = re.match(r'Download \((\w+)\)', self.parser.tocleanstring(a))
    if m:
        ext = m.group(1)
    else:
        ext = u'zip'
    url = unicode('http://www.opensubtitles.org/en/subtitleserve/sub/%s' % sub_id)

    link = self.parser.select(root, 'link[rel=bookmark]', 1)
    title = unicode(link.attrib.get('title', ''))
    nb_cd = int(title.lower().split('cd')[0].split()[-1])
    lang = unicode(title.split('(')[1].split(')')[0])

    # Without any listed file the name is the bare title; this avoids
    # formatting with a ``file_name`` that was never bound.
    name = title
    file_names = self.parser.select(root, "img[title~=filename]")
    if len(file_names) > 0:
        file_name = ' '.join(file_names[0].getparent().text_content().split())
        desc = u'files :'
        for f in file_names:
            desc_line = f.getparent().text_content()
            desc += '\n' + ' '.join(desc_line.split())
        name = unicode('%s (%s)' % (title, file_name))

    # Map the long language name found in the title to its short code.
    for lshort, llong in LANGUAGE_CONV.items():
        if lang == llong:
            lang = unicode(lshort)
            break

    subtitle = Subtitle(sub_id, name)
    subtitle.url = url
    subtitle.ext = ext
    subtitle.language = lang
    subtitle.nb_cd = nb_cd
    subtitle.description = desc
    return subtitle