def get_subtitle(self):
    """Build a Subtitle from the subtitle detail page.

    The identifier is the last ``/``-separated part of the ``rel``
    attribute of the ``a#bt-dwl`` download link.  The extension is read
    from the link label ("Download (ext)") and falls back to ``zip`` when
    the label does not match.  Title, number of CDs and language are parsed
    from the ``title`` attribute of the ``link[rel=bookmark]`` element.

    Returns:
        The populated Subtitle.  When the page shows file names, the first
        one is appended to the name and all of them are listed in the
        description; otherwise the name is the bare title and the
        description is NotAvailable.
    """
    root = self.document.getroot()

    download_link = self.parser.select(root, 'a#bt-dwl', 1)
    sub_id = download_link.attrib.get('rel', '').split('/')[-1]
    label_match = re.match(r'Download \((\w+)\)',
                           self.parser.tocleanstring(download_link))
    ext = label_match.group(1) if label_match else u'zip'
    url = unicode('http://www.opensubtitles.org/subtitleserve/sub/%s' % sub_id)

    bookmark = self.parser.select(root, 'link[rel=bookmark]', 1)
    title = unicode(bookmark.attrib.get('title', ''))
    # The CD count is the last token preceding the first "cd" in the title.
    nb_cd = int(title.lower().split('cd')[0].split()[-1])
    # The language is the text between the first "(" and the next ")".
    lang = unicode(title.split('(')[1].split(')')[0])

    # One whitespace-normalised line per file name image found on the page.
    file_lines = [
        ' '.join(img.getparent().text_content().split())
        for img in self.parser.select(root, "img[title~=filename]")
    ]
    if file_lines:
        desc = u'files :\n' + '\n'.join(file_lines)
        name = unicode('%s (%s)' % (title, file_lines[0]))
    else:
        # No file name on the page: previously this path crashed because
        # the file name was never assigned; fall back to the title alone.
        desc = NotAvailable
        name = title

    # Replace the long language name by its short code when it is known.
    for lshort, llong in LANGUAGE_CONV.items():
        if lang == llong:
            lang = unicode(lshort)
            break

    subtitle = Subtitle(sub_id, name)
    subtitle.url = url
    subtitle.ext = ext
    subtitle.language = lang
    subtitle.nb_cd = nb_cd
    subtitle.description = desc
    return subtitle
def get_subtitle(self):
    """Build a Subtitle from the subtitle detail page.

    The name comes from the third cell of the row holding the
    ``img[alt=filename]`` image, the identifier from the last segment of
    the current URL (without ``.html`` and ``subtitle-``), the number of
    CDs from the ``tr[title~=amount]`` row and the file list from the
    ``tr[title~=list]`` row.  The language is the part of the download URL
    after its last ``-``.

    Returns:
        The populated Subtitle; its extension is split off the name when
        the name ends in ``.<ext>``, and defaults to ``zip`` otherwise.
    """
    select = self.parser.select
    root = self.document.getroot()

    filename_img = select(root, 'img[alt=filename]', 1)
    filename_row = filename_img.getparent().getparent()
    name = unicode(select(filename_row, 'td')[2].text)

    page = self.browser.geturl().split('/')[-1]
    sub_id = page.replace('.html', '').replace('subtitle-', '')
    url = unicode('http://%s/download-%s.html' % (self.browser.DOMAIN, sub_id))

    amount_row = select(root, 'tr[title~=amount]', 1)
    nb_cd = int(select(amount_row, 'td')[2].text)
    lang = unicode(url.split('-')[-1].split('.html')[0])

    list_row = select(root, 'tr[title~=list]', 1)
    listing = select(list_row, 'td')[2].text_content().strip()
    # Put each .srt entry of the listing on its own line.
    desc = u"files :\n" + listing.replace('.srt', '.srt\n')

    ext = 'zip'
    parts = re.match('(.*?)\.(\w+)$', name)
    if parts:
        name, ext = parts.group(1), parts.group(2)

    subtitle = Subtitle(sub_id, name)
    subtitle.url = url
    subtitle.ext = ext
    subtitle.language = lang
    subtitle.nb_cd = nb_cd
    subtitle.description = desc
    return subtitle
def get_subtitle(self, id):
    """Build the Subtitle designated by *id* from the current page.

    Args:
        id: identifier of the form ``<something>|<href>``; the part after
            the first ``|`` is the href of the subtitle link.

    Returns:
        A French-language Subtitle named ``original (translated)``, whose
        extension is the text after the last ``.`` of its URL and whose
        description is NotAvailable.
    """
    href = id.split('|')[1]

    # Find the link to this address, then climb five levels up to reach
    # the element whose 'td' cells describe the subtitle.
    anchor = self.parser.select(self.document.getroot(), 'a[href="%s"]' % href, 1)
    row = anchor
    for _ in range(5):
        row = row.getparent()
    cells = self.parser.select(row, 'td')

    def font_text(index):
        # Text of the single <font> element inside the given cell.
        return self.parser.select(cells[index], 'font', 1).text

    # Lower-case the titles; split/join trashes special spacing chars.
    translated = " ".join(font_text(0).lower().split())
    original = " ".join(font_text(1).lower().split())
    # The CD count is the first token of the third cell.
    nb_cd = int(font_text(2).strip().split()[0])

    url = unicode('http://davidbillemont3.free.fr/%s' % href)

    subtitle = Subtitle(id, unicode('%s (%s)' % (original, translated)))
    subtitle.url = url
    subtitle.ext = url.split('.')[-1]
    subtitle.language = unicode('fr')
    subtitle.nb_cd = nb_cd
    subtitle.description = NotAvailable
    return subtitle
def get_subtitle(self):
    """Assemble a Subtitle from the information rows of the detail page.

    Each piece of information is read from the third ``td`` of a row:
    the file name (row containing ``img[alt=filename]``), the number of
    CDs (``tr[title~=amount]``) and the list of files
    (``tr[title~=list]``).  The identifier is derived from the current
    URL and the language from the end of the download URL.

    Returns:
        The populated Subtitle.  A trailing ``.<ext>`` is removed from the
        name and used as extension; without one the extension is ``zip``.
    """
    def third_cell(row):
        # The value of an information row is held by its third cell.
        return self.parser.select(row, 'td')[2]

    filename_row = self.parser.select(
        self.document.getroot(), 'img[alt=filename]', 1).getparent().getparent()
    name = unicode(third_cell(filename_row).text)

    last_segment = self.browser.geturl().split('/')[-1]
    sub_id = last_segment.replace('.html', '').replace('subtitle-', '')
    url = unicode('http://%s/download-%s.html' % (self.browser.DOMAIN, sub_id))

    amount_row = self.parser.select(self.document.getroot(), 'tr[title~=amount]', 1)
    nb_cd = int(third_cell(amount_row).text)

    # Language: what follows the last "-" of the URL, up to ".html".
    lang = unicode(url.split('-')[-1].split('.html')[0])

    files_row = self.parser.select(self.document.getroot(), 'tr[title~=list]', 1)
    files_text = third_cell(files_row).text_content().strip()
    desc = u"files :\n"
    desc += files_text.replace('.srt', '.srt\n')

    name_and_ext = re.match('(.*?)\.(\w+)$', name)
    if name_and_ext is None:
        ext = 'zip'
    else:
        name = name_and_ext.group(1)
        ext = name_and_ext.group(2)

    subtitle = Subtitle(sub_id, name)
    subtitle.url = url
    subtitle.ext = ext
    subtitle.language = lang
    subtitle.nb_cd = nb_cd
    subtitle.description = desc
    return subtitle
def get_subtitle(self):
    """Build a Subtitle from the subtitle detail page.

    The name is the text of the third ``td`` in the row containing the
    ``filename`` image.  The identifier is the last segment of the page
    URL without ``.html`` and ``subtitle-``.  The number of CDs and the
    file list come from the rows whose ``title`` contains ``amount`` and
    ``list`` respectively; exactly one such row of each kind is expected
    (the unpacking raises ValueError otherwise).

    Returns:
        The populated Subtitle.  A trailing ``.<ext>`` is split off the
        name and used as extension; without one the extension is ``zip``.
    """
    filename_row = self.doc.xpath('//img[@alt="filename"]')[0].getparent().getparent()
    name = to_unicode(filename_row.xpath('.//td')[2].text)

    sub_id = self.url.split('/')[-1].replace('.html', '').replace('subtitle-', '')
    url = '%s/download-%s.html' % (self.browser.BASEURL, sub_id)

    amount_row, = self.doc.xpath('//tr[contains(@title, "amount")]')
    nb_cd = int(amount_row.xpath('.//td')[2].text)

    # Language: what follows the last "-" of the URL, up to ".html".
    lang = url.split('-')[-1].split('.html')[0]

    files_row, = self.doc.xpath('//tr[contains(@title,"list")]')
    files_text = files_row.xpath('.//td')[2].text_content().strip()
    # Put each .srt entry of the listing on its own line.
    desc = u"files :\n" + files_text.replace('.srt', '.srt\n')

    # Raw string: "\." and "\w" are regex escapes, not string escapes.
    name_and_ext = re.match(r'(.*?)\.(\w+)$', name)
    if name_and_ext:
        name = name_and_ext.group(1)
        ext = name_and_ext.group(2)
    else:
        ext = 'zip'

    subtitle = Subtitle(sub_id, name)
    subtitle.url = url
    subtitle.ext = ext
    subtitle.language = lang
    subtitle.nb_cd = nb_cd
    subtitle.description = desc
    return subtitle
def get_subtitle(self, id):
    """Return the Subtitle whose link is referenced by *id*.

    Args:
        id: string containing a ``|``; the second ``|``-separated field is
            the href of the subtitle link searched for in the page.

    Returns:
        A Subtitle with language ``fr``, a name of the form
        ``original (translated)``, the extension taken from the end of
        its URL and NotAvailable as description.
    """
    def ancestor(node, levels):
        # Walk `levels` parents up from `node`.
        while levels > 0:
            node = node.getparent()
            levels -= 1
        return node

    href = id.split('|')[1]
    # we have to find the enclosing element which contains the link to
    # this address: it sits five levels above the anchor
    link = self.parser.select(self.document.getroot(), 'a[href="%s"]' % href, 1)
    columns = self.parser.select(ancestor(link, 5), 'td')

    translated_raw = self.parser.select(columns[0], 'font', 1).text.lower()
    original_raw = self.parser.select(columns[1], 'font', 1).text.lower()
    cd_label = self.parser.select(columns[2], 'font', 1).text.strip()
    nb_cd = int(cd_label.split()[0])

    # this is to trash special spacing chars
    translated_title = " ".join(translated_raw.split())
    original_title = " ".join(original_raw.split())

    name = unicode('%s (%s)' % (original_title, translated_title))
    url = unicode('http://davidbillemont3.free.fr/%s' % href)

    result = Subtitle(id, name)
    result.url = url
    result.ext = url.split('.')[-1]
    result.language = unicode('fr')
    result.nb_cd = nb_cd
    result.description = NotAvailable
    return result
def iter_subtitles(self, language, pattern):
    """Yield a Subtitle for each row of the results table matching *pattern*.

    Args:
        language: not used by this method.
        pattern: search string; ``+`` is treated as a space and the
            comparison is done on lower-cased words.

    A row matches when the pattern is a single word present in either
    title, or when the pattern shares at least two words with either
    title.  Nothing is yielded when the results table is missing or when
    its ``width`` attribute is not ``100%``.
    """
    pattern = pattern.strip().replace('+', ' ').lower()
    pattern_words = pattern.split()
    wanted = set(pattern_words)
    single_word = len(pattern_words) == 1

    tables = self.parser.select(self.document.getroot(), 'table[bordercolor="#B8C0B2"]')
    if not tables:
        tables = self.parser.select(self.document.getroot(), 'table[bordercolordark="#B8C0B2"]')
    if not tables:
        return
    results = tables[0]
    # some results of freefind point on useless pages
    if results.attrib.get('width', '') != '100%':
        return

    for row in results.getiterator('tr'):
        cells = self.parser.select(row, 'td')
        translated_words = self.parser.select(cells[0], 'font', 1).text.lower().split()
        original_words = self.parser.select(cells[1], 'font', 1).text.lower().split()

        # one-word pattern found in a title, OR at least two words shared
        # between the pattern and a title
        single_hit = single_word and (
            pattern in translated_words or pattern in original_words)
        overlap_hit = (len(wanted & set(translated_words)) > 1 or
                       len(wanted & set(original_words)) > 1)
        if not (single_hit or overlap_hit):
            continue

        # joining the words back trashes special spacing chars
        translated_title = " ".join(translated_words)
        original_title = " ".join(original_words)
        cd_label = self.parser.select(cells[2], 'font', 1).text.strip()
        nb_cd = int(cd_label.strip(' CD'))
        name = unicode('%s (%s)' % (original_title, translated_title))
        href = self.parser.select(cells[3], 'a', 1).attrib.get('href', '')
        url = unicode('http://davidbillemont3.free.fr/%s' % href)
        sub_id = unicode('%s|%s' % (self.browser.geturl().split('/')[-1], href))

        subtitle = Subtitle(sub_id, name)
        subtitle.url = url
        subtitle.ext = url.split('.')[-1]
        subtitle.language = unicode('fr')
        subtitle.nb_cd = nb_cd
        subtitle.description = NotAvailable
        yield subtitle
def iter_subtitles(self, language, pattern):
    """Iterate over the subtitles of the results table that match *pattern*.

    Args:
        language: not used by this method.
        pattern: searched text; it is stripped, ``+`` becomes a space and
            it is lower-cased before being split into words.

    Yields:
        One French-language Subtitle per matching row.  Its id is
        ``<last segment of the current URL>|<href of the row link>``.
        The generator ends immediately when no results table is found or
        when the table's ``width`` attribute differs from ``100%``.
    """
    def font_text(cell):
        # Text of the single <font> element of a cell.
        return self.parser.select(cell, 'font', 1).text

    def matches(words, title_words):
        # A one-word pattern must appear in the title; otherwise pattern
        # and title must have at least two words in common.
        if len(words) == 1 and pattern in title_words:
            return True
        return len(set(words) & set(title_words)) > 1

    pattern = pattern.strip().replace('+', ' ').lower()
    pattern_words = pattern.split()

    found = self.parser.select(self.document.getroot(), 'table[bordercolor="#B8C0B2"]')
    if len(found) == 0:
        found = self.parser.select(self.document.getroot(), 'table[bordercolordark="#B8C0B2"]')
        if len(found) == 0:
            return
    table = found[0]
    # some results of freefind point on useless pages
    if table.attrib.get('width', '') != '100%':
        return

    for tr in table.getiterator('tr'):
        tds = self.parser.select(tr, 'td')
        translated_words = font_text(tds[0]).lower().split()
        original_words = font_text(tds[1]).lower().split()

        if matches(pattern_words, translated_words) or matches(pattern_words, original_words):
            # this is to trash special spacing chars
            name = unicode('%s (%s)' % (" ".join(original_words),
                                        " ".join(translated_words)))
            nb_cd = int(font_text(tds[2]).strip().strip(' CD'))
            href = self.parser.select(tds[3], 'a', 1).attrib.get('href', '')
            url = unicode('http://davidbillemont3.free.fr/%s' % href)
            current_page = self.browser.geturl().split('/')[-1]

            subtitle = Subtitle(unicode('%s|%s' % (current_page, href)), name)
            subtitle.url = url
            subtitle.ext = url.split('.')[-1]
            subtitle.language = unicode('fr')
            subtitle.nb_cd = nb_cd
            subtitle.description = NotAvailable
            yield subtitle
def get_subtitle(self):
    """Build a Subtitle from the subtitle detail page.

    The identifier is the ``data-product-id`` attribute of the
    ``a#bt-dwl-bt`` download link.  The extension is read from the link
    label ("Download (ext)") and falls back to ``zip`` when the label does
    not match.  Title, number of CDs and language are parsed from the
    ``title`` attribute of the ``link[rel=bookmark]`` element.

    Returns:
        The populated Subtitle.  When the page shows file names, the first
        one is appended to the name and all of them are listed in the
        description; otherwise the name is the bare title and the
        description is NotAvailable.
    """
    root = self.document.getroot()

    download_link = self.parser.select(root, 'a#bt-dwl-bt', 1)
    sub_id = download_link.attrib.get('data-product-id', '')
    label_match = re.match(r'Download \((\w+)\)',
                           self.parser.tocleanstring(download_link))
    ext = label_match.group(1) if label_match else u'zip'
    url = unicode('http://www.opensubtitles.org/en/subtitleserve/sub/%s' % sub_id)

    bookmark = self.parser.select(root, 'link[rel=bookmark]', 1)
    title = unicode(bookmark.attrib.get('title', ''))
    # The CD count is the last token preceding the first "cd" in the title.
    nb_cd = int(title.lower().split('cd')[0].split()[-1])
    # The language is the text between the first "(" and the next ")".
    lang = unicode(title.split('(')[1].split(')')[0])

    # One whitespace-normalised line per file name image found on the page.
    file_lines = [
        ' '.join(img.getparent().text_content().split())
        for img in self.parser.select(root, "img[title~=filename]")
    ]
    if file_lines:
        desc = u'files :\n' + '\n'.join(file_lines)
        name = unicode('%s (%s)' % (title, file_lines[0]))
    else:
        # No file name on the page: previously this path crashed because
        # the file name was never assigned; fall back to the title alone.
        desc = NotAvailable
        name = title

    # Replace the long language name by its short code when it is known.
    for lshort, llong in LANGUAGE_CONV.items():
        if lang == llong:
            lang = unicode(lshort)
            break

    subtitle = Subtitle(sub_id, name)
    subtitle.url = url
    subtitle.ext = ext
    subtitle.language = lang
    subtitle.nb_cd = nb_cd
    subtitle.description = desc
    return subtitle