def iter_torrents(self):
    """Yield one ``Torrent`` per ``div.list_tor`` element of the document.

    Seeders and leechers are read from the ``b.green`` / ``b.red`` elements
    and fall back to 0 when their text is not a valid integer.  ``url``,
    ``magnet``, ``description`` and ``files`` are left as ``NotLoaded``.
    """
    def peer_count(container, selector):
        # A non-numeric counter is reported as 0 instead of aborting the listing.
        try:
            return int(self.parser.select(container, selector, 1).text)
        except ValueError:
            return 0

    for entry in self.parser.select(self.document.getroot(), 'div.list_tor'):
        stats = self.parser.select(entry, 'div.list_tor_right', 1)
        seeders = peer_count(stats, 'b.green')
        leechers = peer_count(stats, 'b.red')

        # assumes the span text reads "<label>: <number> <unit>" -- TODO confirm
        size_span = self.parser.select(self.parser.select(stats, 'p')[0], 'span')[0]
        amount = float(size_span.text_content().split(':')[1].split()[0])
        unit = size_span.text_content().split()[-1].upper()
        size = get_bytes_size(amount, unit)

        link = self.parser.select(entry, 'a.list_tor_title', 1)
        href = link.attrib.get('href', '')
        name = unicode(link.text_content())
        # The identifier is the link target without slashes at the ends,
        # cut at ".html".
        torrent_id = unicode(href.strip('/').split('.html')[0])

        torrent = Torrent(torrent_id, name)
        torrent.url = NotLoaded
        torrent.filename = torrent_id
        torrent.magnet = NotLoaded
        torrent.size = size
        torrent.seeders = seeders
        torrent.leechers = leechers
        torrent.description = NotLoaded
        torrent.files = NotLoaded
        yield torrent
def iter_torrents(self):
    """Yield a ``Torrent`` for every result row of the document.

    A result row is a ``<tr class="hlRow">`` element that also carries an
    ``onmouseout`` attribute.
    """
    for row in self.document.getiterator('tr'):
        if row.attrib.get('class', '') != 'hlRow':
            continue
        # Sometimes the first tr also has the hlRow class; only rows with an
        # onmouseout attribute are treated as results.
        if 'onmouseout' not in row.attrib:
            continue

        cells = row.getchildren()
        name_cell = cells[2].getchildren()
        title_link = name_cell[1]

        # The title may be spread over the link text and its child elements.
        title = title_link.text or ''
        for part in title_link.getchildren():
            title += (part.text or '') + (part.tail or '')

        # Third "/"-separated component of the first link's target.
        torrent_id = name_cell[0].attrib.get('href', '').split('/')[2]

        # assumes the size cell reads "<number> <2-letter unit>" -- TODO confirm
        size_text = cells[3].text
        unit = size_text[-2:]
        amount = float(size_text[:-3])

        seeders = cells[4].text
        leechers = cells[5].text

        torrent = Torrent(torrent_id, title)
        torrent.url = 'https://isohunt.com/download/%s/mon_joli_torrent.torrent' % torrent_id
        torrent.size = get_bytes_size(amount, unit)
        torrent.seeders = int(seeders)
        torrent.leechers = int(leechers)
        yield torrent
def get_torrent(self, id):
    """Build a ``Torrent`` from the details page held in ``self.document``.

    :param id: identifier given to the returned ``Torrent``
    :return: the populated ``Torrent``; ``filename`` is only set when a
        download URL was found
    """
    def peer_count(block):
        # The counter is the text of the block's second child; no text means 0.
        raw = block.getchildren()[1].text
        return int(raw) if raw is not None else 0

    seed = 0
    leech = 0
    description = NotAvailable
    url = NotAvailable

    for div in self.document.getiterator('div'):
        css_class = div.attrib.get('class', '')
        if div.attrib.get('id', '') == 'desc':
            try:
                description = div.text_content().strip()
            except UnicodeDecodeError:
                description = 'Description with invalid UTF-8.'
        elif css_class == 'seedBlock':
            seed = peer_count(div)
        elif css_class == 'leechBlock':
            leech = peer_count(div)

    title = self.parser.select(self.document.getroot(), 'h1.torrentName span', 1).text

    # The last link whose tooltip mentions both phrases wins.
    for a in self.document.getiterator('a'):
        tooltip = a.attrib.get('title', '')
        if 'Download' in tooltip and 'torrent file' in tooltip:
            url = a.attrib.get('href', '')

    size = 0
    u = ''
    for span in self.document.getiterator('span'):
        # Other spans can carry the same classes; the number of children is
        # used as a (not fully reliable) hint to pick the one with the size.
        if span.attrib.get('class', '') not in ('folder', 'folderopen'):
            continue
        children = span.getchildren()
        if len(children) <= 2:
            continue
        u = children[2].text
        size = float(children[1].tail.split(': ')[1].replace(',', '.'))

    files = [td.text for td in self.document.getiterator('td')
             if td.attrib.get('class', '') == 'torFileName']

    torrent = Torrent(id, title)
    torrent.url = url
    if torrent.url:
        # The file name is the "title" query parameter of the download URL.
        torrent.filename = parse_qs(urlsplit(url).query).get('title', [None])[0]
    torrent.size = get_bytes_size(size, u)
    torrent.seeders = int(seed)
    torrent.leechers = int(leech)
    torrent.description = description
    torrent.files = files
    return torrent
def get_torrent(self, id):
    """Parse the torrent details page in ``self.document`` into a ``Torrent``.

    :param id: identifier given to the returned ``Torrent``
    :return: the populated ``Torrent``; ``filename`` is only set when a
        download URL was found
    """
    seed = 0
    leech = 0
    description = NotAvailable
    url = NotAvailable

    for block in self.document.getiterator('div'):
        block_class = block.attrib.get('class', '')
        if block.attrib.get('id', '') == 'desc':
            try:
                description = block.text_content().strip()
            except UnicodeDecodeError:
                description = 'Description with invalid UTF-8.'
        elif block_class == 'seedBlock':
            # The counter is the text of the second child; no text means 0.
            counter = block.getchildren()[1].text
            seed = 0 if counter is None else int(counter)
        elif block_class == 'leechBlock':
            counter = block.getchildren()[1].text
            leech = 0 if counter is None else int(counter)

    title_span = self.parser.select(self.document.getroot(), 'h1.torrentName span', 1)
    title = title_span.text

    for anchor in self.document.getiterator('a'):
        hint = anchor.attrib.get('title', '')
        if 'Download' not in hint or 'torrent file' not in hint:
            continue
        # No break: a later matching link replaces an earlier one.
        url = anchor.attrib.get('href', '')

    size = 0
    u = ''
    for span in self.document.getiterator('span'):
        # Several spans may use these classes; requiring more than two
        # children is a heuristic to find the one that carries the size.
        is_folder = span.attrib.get('class', '') in ('folder', 'folderopen')
        if is_folder and len(span.getchildren()) > 2:
            parts = span.getchildren()
            raw_size = parts[1].tail
            u = parts[2].text
            size = float(raw_size.split(': ')[1].replace(',', '.'))

    files = []
    for cell in self.document.getiterator('td'):
        if cell.attrib.get('class', '') == 'torFileName':
            files.append(cell.text)

    torrent = Torrent(id, title)
    torrent.url = url
    if torrent.url:
        query = urlsplit(url).query
        torrent.filename = parse_qs(query).get('title', [None])[0]
    torrent.size = get_bytes_size(size, u)
    torrent.seeders = int(seed)
    torrent.leechers = int(leech)
    torrent.description = description
    torrent.files = files
    return torrent
def iter_torrents(self):
    """Yield a ``Torrent`` for every result row of the document.

    Result rows are ``<tr>`` elements whose class is ``odd`` or `` even``
    and which carry an ``id`` attribute.  ``description`` and ``files`` are
    left as ``NotLoaded``; ``url`` and ``magnet`` stay ``NotAvailable`` when
    the row has no matching link.

    ``filename`` is the ``title`` query parameter of the download URL.  It is
    ``NotAvailable`` when the row has no download URL or the URL has no such
    parameter (previously this case either raised or produced ``u'None'``).
    """
    for tr in self.document.getiterator('tr'):
        if tr.attrib.get('class', '') not in ('odd', ' even'):
            continue
        if 'id' not in tr.attrib:
            continue

        cells = tr.getchildren()
        title_link = cells[0].getchildren()[1].getchildren()[1]

        title = unicode(title_link.text) if title_link.text else u''
        # Part of the title may live in child elements of the link.
        for red in title_link.getchildren():
            title += red.text_content()

        idt = title_link.attrib.get('href', '').replace('/', '').replace('.html', '')

        # look for url
        magnet = NotAvailable
        url = NotAvailable
        for a in self.parser.select(tr, 'div.iaconbox a'):
            href = a.attrib.get('href', '')
            if href.startswith('magnet'):
                magnet = unicode(href)
            elif href.startswith('http'):
                url = unicode(href)
            elif href.startswith('//'):
                # Scheme-relative link: prefix it with https.
                url = u'https:%s' % href

        # The cell text is the number (decimal comma allowed); its first
        # child element holds the unit.
        size = float(cells[1].text.replace(',', '.'))
        u = cells[1].getchildren()[0].text
        seed = cells[4].text
        leech = cells[5].text

        # Only parse the URL when there is one, and never stringify None.
        filename = NotAvailable
        if url is not NotAvailable:
            name = parse_qs(urlsplit(url).query).get('title', [None])[0]
            if name is not None:
                filename = unicode(name)

        torrent = Torrent(idt, title)
        torrent.url = url
        torrent.magnet = magnet
        torrent.description = NotLoaded
        torrent.files = NotLoaded
        torrent.filename = filename
        torrent.size = get_bytes_size(size, u)
        torrent.seeders = int(seed)
        torrent.leechers = int(leech)
        yield torrent
def get_torrent(self):
    """Build a ``Torrent`` from the details page held in ``self.document``.

    The identifier (also used as ``filename``) is derived from the browser's
    current URL: the part before ``.html``, reduced to its last path
    component.

    :return: the populated ``Torrent``
    """
    torrent_id = unicode(self.browser.geturl().split('.html')[0].split('/')[-1])

    content = self.parser.select(self.document.getroot(), 'div#middle_content', 1)
    heading = self.parser.select(self.document.getroot(), 'div#middle_content > h1', 1)
    title = u'%s' % heading.text

    # Seeders and leechers are the first two <b> of the sl_block, if present.
    seed = 0
    leech = 0
    counters = self.parser.select(content, 'div.sl_block b')
    if len(counters) >= 2:
        seed = counters[0].text
        leech = counters[1].text

    download_path = self.parser.select(content, 'a.down', 1).attrib.get('href', '')
    url = u'http://%s%s' % (self.browser.DOMAIN, download_path)
    magnet = unicode(self.parser.select(content, 'a.magnet', 1).attrib.get('href', ''))

    tabs = self.parser.select(content, 'div#tabs', 1)
    file_blocks = self.parser.select(tabs, 'div.body > div.doubleblock > div.leftblock')
    files = []
    if len(file_blocks) > 0:
        # NOTE(review): the whole selection result (not one element) is handed
        # to select() here -- confirm that select() accepts it.
        size_text = self.parser.select(file_blocks, 'h5', 1).text
        for bold in self.parser.select(file_blocks, 'b'):
            files.append(bold.getparent().text_content())
    else:
        size_text = self.parser.select(tabs, 'h5', 1).text_content()

    # Keep what sits between the first "(" and the following ")", then use the
    # second comma-separated field: digits give the size, letters the unit.
    size_text = size_text.split('(')[1].split(')')[0].strip()
    size_field = size_text.split(',')[1]
    size = float(size_field.strip(string.letters))
    u = size_field.strip().translate(None, string.digits).strip('.').strip().upper()

    description = NotAvailable
    description_nodes = self.parser.select(tabs, 'div#descriptionContent')
    if len(description_nodes) > 0:
        description = unicode(description_nodes[0].text_content())

    torrent = Torrent(torrent_id, title)
    torrent.url = url
    torrent.filename = torrent_id
    torrent.magnet = magnet
    torrent.size = get_bytes_size(size, u)
    torrent.seeders = int(seed)
    torrent.leechers = int(leech)
    torrent.description = description
    torrent.files = files
    return torrent
def get_torrent(self, id):
    """Build a ``Torrent`` from the details page held in ``self.document``.

    Values are picked from ``div`` elements by id (``title``, ``details``) or
    class (``download``, ``nfo``) using fixed child positions.

    :param id: identifier given to the returned ``Torrent``
    :return: the populated ``Torrent``; ``files`` is the placeholder ``['NYI']``
    """
    for div in self.document.getiterator('div'):
        div_id = div.attrib.get('id', '')
        div_class = div.attrib.get('class', '')

        if div_id == 'title':
            title = div.text.strip()
        elif div_class == 'download':
            url = div.getchildren()[0].attrib.get('href', '')
        elif div_id == 'details':
            columns = div.getchildren()
            # The size is the number between "(" and "Bytes" in the sixth
            # child of the first column.
            size = float(columns[0].getchildren()[5].text.split('(')[1].split('Bytes')[0])
            # The peer counts sit at different positions depending on whether
            # a second column with class "col2" exists.
            if len(columns) > 1 and columns[1].attrib.get('class', '') == 'col2':
                fields = columns[1].getchildren()
                seed, leech = fields[7].text, fields[9].text
            else:
                fields = columns[0].getchildren()
                seed, leech = fields[24].text, fields[26].text
        elif div_class == 'nfo':
            description = div.getchildren()[0].text

    torrent = Torrent(id, title)
    torrent.url = url
    torrent.size = size
    torrent.seeders = int(seed)
    torrent.leechers = int(leech)
    torrent.description = description
    torrent.files = ['NYI']
    return torrent
def get_torrent(self, id):
    """Build a ``Torrent`` from the details page held in ``self.document``.

    :param id: identifier given to the returned ``Torrent``; it is also used
        to build the download URL
    :return: the populated ``Torrent``
    """
    url = 'https://isohunt.com/download/%s/%s.torrent' % (id, id)

    # The title is the text that follows the "Search more torrents of" link.
    title = ''
    for a in self.document.getiterator('a'):
        if 'Search more torrents of' in a.attrib.get('title', ''):
            title = a.tail

    seed = -1
    leech = -1
    tip_id = "none"
    for span in self.document.getiterator('span'):
        hover = span.attrib.get('onmouseover', '')
        if span.attrib.get('style', '') == 'color:green;' and 'ShowTip' in hover:
            seed = span.tail.split(' ')[1]
            # The first quoted argument of the handler names the tooltip div.
            tip_id = hover.split("'")[1]

    for div in self.document.getiterator('div'):
        if div.attrib.get('class', '') == 'dirs ydsf' and tip_id in div.attrib.get('id', ''):
            # Tooltip shown on mouse hover: it holds the leecher count.
            leech = div.getchildren()[0].getchildren()[1].tail.split(' ')[2]
        elif div.attrib.get('id', '') == 'torrent_details':
            # The <b> holding the size has no distinguishing attribute, so it
            # is reached by position from its ancestor.
            size_text = div.getchildren()[6].getchildren()[0].getchildren()[0].text
            u = size_text[-2:]
            size = float(size_text[:-3])

    # files and description (uploader's comment)
    description = 'No description'
    files = []
    styled_paragraphs = 0
    for p in self.document.getiterator('p'):
        if p.attrib.get('style', '') != 'line-height:1.2em;margin-top:1.8em':
            continue
        styled_paragraphs += 1
        if styled_paragraphs == 1:
            # First styled paragraph: description.
            comment = p.getchildren()[1].tail
            if comment is not None:
                description = comment
        elif styled_paragraphs == 2:
            # Second styled paragraph: a single file or directory name.
            label = p.getchildren()[0]
            if label.text == 'Directory:':
                files.append(label.tail.strip() + '/')
            else:
                files.append(label.tail.strip())

    for td in self.document.getiterator('td'):
        if td.attrib.get('class', '') != 'fileRows':
            continue
        # Path components follow each child element; join them with "/".
        filename = td.text
        for separator in td.getchildren():
            filename += '/'
            filename += separator.tail
        files.append(filename)

    #--------------------------TODO

    torrent = Torrent(id, title)
    torrent.url = url
    torrent.size = get_bytes_size(size, u)
    torrent.seeders = int(seed)
    torrent.leechers = int(leech)
    torrent.description = description
    torrent.files = files
    return torrent
def get_torrent(self, id):
    """Build a ``Torrent`` from the details page held in ``self.document``.

    The title, download links, size/peer counts and description are read from
    the ``div`` elements with id ``title``, class ``download``, id ``details``
    and class ``nfo`` respectively.

    :param id: identifier given to the returned ``Torrent``
    :return: the populated ``Torrent``; ``files`` is always ``NotAvailable``
    """
    url = NotAvailable
    magnet = NotAvailable

    for div in self.document.getiterator('div'):
        div_id = div.attrib.get('id', '')
        div_class = div.attrib.get('class', '')

        if div_id == 'title':
            title = unicode(unescape(div.text.strip()))

        elif div_class == 'download':
            for link in self.parser.select(div, 'a'):
                href = link.attrib.get('href', '')
                # https fails on the download server, so downgrade to http.
                if href.startswith('https://'):
                    href = href.replace('https://', 'http://', 1)
                if href.startswith('magnet:'):
                    magnet = unicode(href)
                elif href:
                    url = unicode(href)

        elif div_id == 'details':
            columns = div.getchildren()
            # The size is the number between "(" and "Bytes" in the sixth
            # child of the first column.
            size = float(columns[0].getchildren()[5].text.split('(')[1].split('Bytes')[0])
            if len(columns) > 1 and columns[1].attrib.get('class', '') == 'col2':
                column = columns[1]
            else:
                column = columns[0]

            # Each count is the text of the element that follows its label.
            seed = "-1"
            leech = "-1"
            previous_text = "none"
            for child in column.getchildren():
                if previous_text == "Seeders:":
                    seed = child.text
                elif previous_text == "Leechers:":
                    leech = child.text
                previous_text = child.text

        elif div_class == 'nfo':
            description = unicode(div.getchildren()[0].text_content().strip())

    torrent = Torrent(id, title)
    torrent.url = url or NotAvailable
    torrent.magnet = magnet
    torrent.size = size
    torrent.seeders = int(seed)
    torrent.leechers = int(leech)
    torrent.description = description
    torrent.files = NotAvailable
    return torrent
def iter_torrents(self):
    """Generate a ``Torrent`` for each search result row in the document.

    Only ``<tr class="hlRow">`` elements that also define ``onmouseout`` are
    considered result rows.
    """
    for tr in self.document.getiterator('tr'):
        # Sometimes the first tr also has the hlRow class; the onmouseout
        # attribute is what distinguishes the rows handled here.
        is_result = tr.attrib.get('class', '') == 'hlRow' and 'onmouseout' in tr.attrib
        if not is_result:
            continue

        columns = tr.getchildren()
        name_column = columns[2].getchildren()

        # Title: link text followed by the text and tail of each child.
        anchor = name_column[1]
        title = anchor.text if anchor.text else ''
        for child in anchor.getchildren():
            if child.text:
                title += child.text
            if child.tail:
                title += child.tail

        # Identifier: third "/"-separated component of the first link's target.
        link_target = name_column[0].attrib.get('href', '')
        idt = link_target.split('/')[2]

        # assumes the size cell reads "<number> <2-letter unit>" -- TODO confirm
        raw_size = columns[3].text
        unit = raw_size[-2:]
        number = float(raw_size[:-3])

        seed_text = columns[4].text
        leech_text = columns[5].text

        torrent = Torrent(idt, title)
        torrent.url = 'https://isohunt.com/download/%s/mon_joli_torrent.torrent' % idt
        torrent.size = get_bytes_size(number, unit)
        torrent.seeders = int(seed_text)
        torrent.leechers = int(leech_text)
        yield torrent
def iter_torrents(self):
    """Yield a ``Torrent`` for every result row of the document.

    Result rows are ``<tr>`` elements whose class is ``odd`` or `` even``
    and which carry an ``id`` attribute.
    """
    for tr in self.document.getiterator('tr'):
        if tr.attrib.get('class', '') not in ('odd', ' even'):
            continue
        if 'id' not in tr.attrib:
            continue

        cells = tr.getchildren()
        title_link = cells[0].getchildren()[1].getchildren()[1]

        # Part of the title may live in child elements of the link.
        title = title_link.text or ''
        for red in title_link.getchildren():
            title += red.text_content()

        idt = title_link.attrib.get('href', '').replace('/', '').replace('.html', '')

        # look for url
        # NOTE(review): url is not reset for each row, so a row without a
        # ".torrent" link reuses the previous row's value (or fails on the
        # first row) -- confirm whether that is intended.
        for a in cells[0].getiterator('a'):
            if '.torrent' in a.attrib.get('href', ''):
                url = a.attrib['href']

        # The cell text is the number (decimal comma allowed); its first
        # child element holds the unit.
        size_text = cells[1].text
        u = cells[1].getchildren()[0].text
        size = float(size_text.replace(',', '.'))

        seed = cells[4].text
        leech = cells[5].text

        torrent = Torrent(idt, title)
        torrent.url = url
        # The file name is the "title" query parameter of the download URL.
        torrent.filename = parse_qs(urlsplit(url).query).get('title', [None])[0]
        torrent.size = get_bytes_size(size, u)
        torrent.seeders = int(seed)
        torrent.leechers = int(leech)
        yield torrent
def iter_torrents(self):
    """Yield a ``Torrent`` for each non-header row of ``table#searchResult``."""
    table = self.parser.select(self.document.getroot(), 'table#searchResult', 1)
    for row in table.getiterator('tr'):
        if row.get('class', '') == "header":
            continue

        cells = row.getchildren()
        name_cell = cells[1]
        name_parts = name_cell.getchildren()

        # First child: block holding the details link (title and identifier).
        details_link = name_parts[0].find('a')
        link = details_link.attrib['href']
        title = details_link.text
        idt = link.split('/')[2]

        # Second child: the download link.
        url = name_parts[1].attrib['href']

        # The second comma-separated field of the <font> text holds the size;
        # number and unit are separated by a non-breaking space, and the "i"
        # of binary units (e.g. "MiB") is dropped.
        size_field = name_cell.find('font').text.split(',')[1].strip()
        number_and_unit = size_field.split(' ')[1].split(u'\xa0')
        u = number_and_unit[1].replace('i', '')
        size = number_and_unit[0]

        seed = cells[2].text
        leech = cells[3].text

        torrent = Torrent(idt, title)
        torrent.url = url
        torrent.size = self.unit(float(size), u)
        torrent.seeders = int(seed)
        torrent.leechers = int(leech)
        yield torrent
def iter_torrents(self):
    """Yield a ``Torrent`` for every result row of the document.

    Result rows are ``<tr>`` elements whose class is ``odd`` or `` even``
    and which carry an ``id`` attribute.  ``description`` and ``files`` are
    left as ``NotLoaded``; ``url`` and ``magnet`` stay ``NotAvailable`` when
    the row has no matching link.

    ``filename`` is the ``title`` query parameter of the download URL.  It is
    ``NotAvailable`` when the row has no download URL or the URL has no such
    parameter (previously this case either raised or produced ``u'None'``).
    """
    for row in self.document.getiterator('tr'):
        if row.attrib.get('class', '') not in ('odd', ' even'):
            continue
        if 'id' not in row.attrib:
            continue

        cells = row.getchildren()
        anchor = cells[0].getchildren()[1].getchildren()[1]

        title = unicode(anchor.text) if anchor.text else u''
        # Part of the title may live in child elements of the link.
        for fragment in anchor.getchildren():
            title += fragment.text_content()

        idt = anchor.attrib.get('href', '').replace('/', '').replace('.html', '')

        # look for url
        magnet = NotAvailable
        url = NotAvailable
        for a in self.parser.select(row, 'div.iaconbox a'):
            href = a.attrib.get('href', '')
            if href.startswith('magnet'):
                magnet = unicode(href)
            elif href.startswith('http'):
                url = unicode(href)
            elif href.startswith('//'):
                # Scheme-relative link: prefix it with https.
                url = u'https:%s' % href

        # The cell text is the number (decimal comma allowed); its first
        # child element holds the unit.
        size = float(cells[1].text.replace(',', '.'))
        u = cells[1].getchildren()[0].text
        seed = cells[4].text
        leech = cells[5].text

        # Only parse the URL when there is one, and never stringify None.
        filename = NotAvailable
        if url is not NotAvailable:
            name = parse_qs(urlsplit(url).query).get('title', [None])[0]
            if name is not None:
                filename = unicode(name)

        torrent = Torrent(idt, title)
        torrent.url = url
        torrent.magnet = magnet
        torrent.description = NotLoaded
        torrent.files = NotLoaded
        torrent.filename = filename
        torrent.size = get_bytes_size(size, u)
        torrent.seeders = int(seed)
        torrent.leechers = int(leech)
        yield torrent
def iter_torrents(self):
    """Yield a ``Torrent`` for each pair of ``table.torrent_name_tbl`` tables.

    The tables are consumed two at a time: the first holds the title link,
    the second the magnet link and the attributes.  The torrent identifier is
    the info-hash taken from the magnet link; seeders, leechers, description
    and files are ``NotAvailable``.
    """
    try:
        tables = self.document.getroot().cssselect('table.torrent_name_tbl')
    except BrokenPageError:
        return

    for index in range(0, len(tables), 2):
        # Title
        title_link = tables[index].cssselect('td.torrent_name a')[0]
        name = unicode(title_link.text)
        # NOTE(review): this link target is read but replaced by the
        # info-hash search URL below.
        url = unicode(title_link.attrib['href'])

        # Other elems
        cells = tables[index + 1].cssselect('td')
        magnet = unicode(cells[0].cssselect('a')[0].attrib['href'])

        # Magnet query: xt=urn:btih:<...>&dn=<...>; the hash is what follows
        # the last ":" of the xt value.
        xt = parse_qs(urlparse(magnet).query)['xt'][0]
        ih = xt.split(':')[-1]

        value, unit = cells[2].cssselect('span.attr_val')[0].text.split()

        # The age is three words; the second one is used as the timedelta
        # keyword -- presumably e.g. "3 days ago", TODO confirm.
        age, age_unit, _ = cells[5].cssselect('span.attr_val')[0].text.split()
        date = datetime.now() - timedelta(**{age_unit: float(age)})

        url = unicode('https://btdigg.org/search?info_hash=%s' % ih)

        torrent = Torrent(ih, name)
        torrent.url = url
        torrent.size = get_bytes_size(float(value), unit)
        torrent.magnet = magnet
        torrent.seeders = NotAvailable
        torrent.leechers = NotAvailable
        torrent.description = NotAvailable
        torrent.files = NotAvailable
        torrent.date = date
        yield torrent
def get_torrent(self, id):
    """Build a ``Torrent`` from the rows of ``table.torrent_info_tbl``.

    The torrent is identified by the info-hash found in its magnet link.
    When the download link is not a magnet link, ``id`` is used as the
    identifier and ``magnet`` is ``NotAvailable`` (this case used to fail on
    unbound local variables).

    :param id: fallback identifier for the returned ``Torrent``
    :return: the populated ``Torrent``; seeders, leechers, description and
        filename are ``NotAvailable``
    """
    trs = self.document.getroot().cssselect('table.torrent_info_tbl tr')

    # magnet
    magnet = NotAvailable
    ih = id
    download = trs[2].cssselect('td a')[0]
    if download.attrib['href'].startswith('magnet:'):
        magnet = unicode(download.attrib['href'])
        # Magnet query: xt=urn:btih:<...>&dn=<...>; the hash is what follows
        # the last ":" of the xt value.
        query = urlparse(magnet).query
        btih = parse_qs(query)['xt'][0]
        ih = btih.split(':')[-1]

    name = unicode(trs[3].cssselect('td')[1].text)
    value, unit = trs[5].cssselect('td')[1].text.split()

    # The age is three words; the second one is used as the timedelta
    # keyword -- presumably e.g. "3 days ago", TODO confirm.
    valueago, valueunit, _ = trs[6].cssselect('td')[1].text.split()
    date = datetime.now() - timedelta(**{valueunit: float(valueago)})

    # Rows from the sixteenth onwards are read as the file list.
    files = [unicode(tr.cssselect('td')[1].text) for tr in trs[15:]]

    torrent = Torrent(ih, name)
    torrent.url = unicode(self.url)
    torrent.size = get_bytes_size(float(value), unit)
    torrent.magnet = magnet
    torrent.seeders = NotAvailable
    torrent.leechers = NotAvailable
    torrent.description = NotAvailable
    torrent.files = files
    torrent.filename = NotAvailable
    torrent.date = date
    return torrent
def get_torrent(self, id):
    """Parse the torrent details page in ``self.document`` into a ``Torrent``.

    The page is scanned once; each ``div`` is dispatched on its id
    (``title``, ``details``) or class (``download``, ``nfo``).

    :param id: identifier given to the returned ``Torrent``
    :return: the populated ``Torrent``; ``files`` is always ``NotAvailable``
    """
    url = NotAvailable
    magnet = NotAvailable

    for block in self.document.getiterator('div'):
        if block.attrib.get('id', '') == 'title':
            title = unicode(unescape(block.text.strip()))

        elif block.attrib.get('class', '') == 'download':
            for anchor in self.parser.select(block, 'a'):
                target = anchor.attrib.get('href', '')
                # https fails on the download server, so fall back to http.
                if target.startswith('https://'):
                    target = target.replace('https://', 'http://', 1)
                if target.startswith('magnet:'):
                    magnet = unicode(target)
                elif len(target):
                    url = unicode(target)

        elif block.attrib.get('id', '') == 'details':
            parts = block.getchildren()
            # The size is the number between "(" and "Bytes" in the sixth
            # child of the first column.
            size_label = parts[0].getchildren()[5].text
            size = float(size_label.split('(')[1].split('Bytes')[0])

            has_second_column = (len(parts) > 1
                                 and parts[1].attrib.get('class', '') == 'col2')
            entries = (parts[1] if has_second_column else parts[0]).getchildren()

            # Walk (label, value) neighbours: a count is the text of the
            # element right after its "Seeders:" / "Leechers:" label.
            seed = "-1"
            leech = "-1"
            for label, value in zip(entries, entries[1:]):
                if label.text == "Seeders:":
                    seed = value.text
                elif label.text == "Leechers:":
                    leech = value.text

        elif block.attrib.get('class', '') == 'nfo':
            description = unicode(block.getchildren()[0].text_content().strip())

    torrent = Torrent(id, title)
    torrent.url = url or NotAvailable
    torrent.magnet = magnet
    torrent.size = size
    torrent.seeders = int(seed)
    torrent.leechers = int(leech)
    torrent.description = description
    torrent.files = NotAvailable
    return torrent
def iter_torrents(self):
    """Generate a ``Torrent`` per result of the search page.

    Every result spans two consecutive ``table.torrent_name_tbl`` tables
    (title, then attributes).  The info-hash from the magnet link is used as
    the torrent identifier and to build its URL; seeders, leechers,
    description and files are ``NotAvailable``.
    """
    try:
        result_tables = self.document.getroot().cssselect('table.torrent_name_tbl')
    except BrokenPageError:
        return

    for start in range(0, len(result_tables), 2):
        # Title
        anchor = result_tables[start].cssselect('td.torrent_name a')[0]
        name = unicode(anchor.text)
        # NOTE(review): this link target is read but replaced by the
        # info-hash search URL below.
        url = unicode(anchor.attrib['href'])

        # Other elems
        attributes = result_tables[start + 1].cssselect('td')
        magnet = unicode(attributes[0].cssselect('a')[0].attrib['href'])

        query = urlparse(magnet).query  # xt=urn:btih:<...>&dn=<...>
        btih = parse_qs(query)['xt'][0]  # urn:btih:<...>
        ih = btih.split(':')[-1]

        size_words = attributes[2].cssselect('span.attr_val')[0].text.split()
        value, unit = size_words

        # The age is three words; the second one is used as the timedelta
        # keyword -- presumably e.g. "3 days ago", TODO confirm.
        age_words = attributes[5].cssselect('span.attr_val')[0].text.split()
        valueago, valueunit, _ = age_words
        delta = timedelta(**{valueunit: float(valueago)})
        date = datetime.now() - delta

        url = unicode('https://btdigg.org/search?info_hash=%s' % ih)

        torrent = Torrent(ih, name)
        torrent.url = url
        torrent.size = get_bytes_size(float(value), unit)
        torrent.magnet = magnet
        torrent.seeders = NotAvailable
        torrent.leechers = NotAvailable
        torrent.description = NotAvailable
        torrent.files = NotAvailable
        torrent.date = date
        yield torrent
def iter_torrents(self):
    """Yield a ``Torrent`` for each result row of ``table#searchResult``.

    Nothing is yielded when selecting the table raises ``BrokenPageError``.
    The first row is always skipped, as are rows whose class is ``header``.
    ``description``, ``files`` and ``magnet`` are left as ``NotLoaded``.
    """
    try:
        table = self.parser.select(self.document.getroot(), 'table#searchResult', 1)
    except BrokenPageError:
        return

    for position, row in enumerate(table.getiterator('tr')):
        if position == 0:
            # The first row is never a result.
            continue
        if row.get('class', '') == "header":
            continue

        cells = row.getchildren()
        name_cell = cells[1]
        name_parts = name_cell.getchildren()

        # First child: block holding the details link (title and identifier).
        details_link = name_parts[0].find('a')
        link = details_link.attrib['href']
        title = unicode(unescape(details_link.text))
        idt = link.split('/')[2]

        # Second child: the download link.
        url = unicode(name_parts[1].attrib['href'])

        # The second comma-separated field of the <font> text holds the size;
        # number and unit are separated by a non-breaking space, and the "i"
        # of binary units (e.g. "MiB") is dropped.
        size_field = name_cell.find('font').text.split(',')[1].strip()
        number_and_unit = size_field.split(' ')[1].split(u'\xa0')
        u = number_and_unit[1].replace('i', '')
        size = number_and_unit[0]

        seed = cells[2].text
        leech = cells[3].text

        torrent = Torrent(idt, title)
        torrent.url = url
        torrent.size = self.unit(float(size), u)
        torrent.seeders = int(seed)
        torrent.leechers = int(leech)
        torrent.description = NotLoaded
        torrent.files = NotLoaded
        torrent.magnet = NotLoaded
        yield torrent
def iter_torrents(self):
    """Yield a ``Torrent`` for every result row of the document.

    A result row is a ``<tr class="hlRow">`` element that also carries an
    ``onmouseout`` attribute.  Seeders and leechers are ``NotAvailable`` when
    their cell has no text; ``description`` and ``files`` are ``NotLoaded``.
    """
    for tr in self.document.getiterator('tr'):
        if tr.attrib.get('class', '') != 'hlRow':
            continue
        # Sometimes the first tr also has the hlRow class; only rows with an
        # onmouseout attribute are treated as results.
        if 'onmouseout' not in tr.attrib:
            continue

        cells = tr.getchildren()
        name_cell = cells[2].getchildren()
        atitle = name_cell[1]

        # Convert only when there is text: unicode(None) would be the truthy
        # u'None' and end up as a bogus prefix of the title.
        title = unicode(atitle.text) if atitle.text else u''
        for bold in atitle.getchildren():
            if bold.text:
                title += bold.text
            if bold.tail:
                title += bold.tail

        # Third "/"-separated component of the first link's target.
        idt = name_cell[0].attrib.get('href', '').split('/')[2]

        # assumes the size cell reads "<number> <2-letter unit>" -- TODO confirm
        size_text = cells[3].text
        u = size_text[-2:]
        size = float(size_text[:-3])

        # Empty or missing counters are reported as NotAvailable.
        seed = NotAvailable
        leech = NotAvailable
        sseed = cells[4].text
        sleech = cells[5].text
        if sseed is not None and sseed != "":
            seed = int(sseed)
        if sleech is not None and sleech != "":
            leech = int(sleech)

        torrent = Torrent(idt, title)
        torrent.url = u'https://isohunt.com/download/%s/mon_joli_torrent.torrent' % idt
        torrent.size = get_bytes_size(size, u)
        torrent.seeders = seed
        torrent.leechers = leech
        torrent.description = NotLoaded
        torrent.files = NotLoaded
        yield torrent
def get_torrent(self, id):
    """Build a ``Torrent`` from the details page held in ``self.document``.

    Peer counts, download URL and title are read by position from the
    ``td.hreview-aggregate`` cell.  The description defaults to
    ``"No description"`` and the size to 0 when the page does not provide
    them.

    :param id: identifier given to the returned ``Torrent``
    :return: the populated ``Torrent``
    """
    description = "No description"
    for div in self.document.getiterator('div'):
        # A "desc" block without direct text keeps the current description
        # instead of failing on None.strip().
        if div.attrib.get('id', '') == 'desc' and div.text is not None:
            description = div.text.strip()

    for td in self.document.getiterator('td'):
        if td.attrib.get('class', '') == 'hreview-aggregate':
            parts = td.getchildren()
            counters = parts[2].getchildren()
            seed = int(counters[0].getchildren()[0].text)
            leech = int(counters[1].getchildren()[0].text)
            url = parts[3].getchildren()[0].attrib.get('href')
            title = parts[1].getchildren()[0].getchildren()[0].text

    size = 0
    u = None
    for span in self.document.getiterator('span'):
        if span.attrib.get('class', '') in ("folder", "folderopen"):
            # assumes the tail reads "...: <number> <unit>)" -- TODO confirm
            raw_size = span.getchildren()[1].tail
            u = raw_size.split(' ')[-1].split(')')[0]
            size = float(raw_size.split(': ')[1].split(' ')[0].replace(',', '.'))

    files = [td.text for td in self.document.getiterator('td')
             if td.attrib.get('class', '') == 'torFileName']

    torrent = Torrent(id, title)
    torrent.url = url
    # Without a size span there is no unit to convert with: keep the default
    # size of 0 rather than failing on an unbound unit.
    torrent.size = self.unit(size, u) if u is not None else size
    torrent.seeders = int(seed)
    torrent.leechers = int(leech)
    torrent.description = description
    torrent.files = files
    return torrent
def iter_torrents(self):
    """Generate a ``Torrent`` for each search result row in the document.

    Rows are ``<tr>`` elements of class ``odd`` or `` even`` that have an
    ``id`` attribute.
    """
    for row in self.document.getiterator('tr'):
        row_class = row.attrib.get('class', '')
        if row_class != 'odd' and row_class != ' even':
            continue
        if 'id' not in row.attrib:
            continue

        columns = row.getchildren()
        anchor = columns[0].getchildren()[1].getchildren()[1]

        # Title: link text plus the text content of each child of the link.
        title = anchor.text
        if not title:
            title = ''
        for fragment in anchor.getchildren():
            title += fragment.text_content()

        target = anchor.attrib.get('href', '')
        idt = target.replace('/', '').replace('.html', '')

        # look for url
        # NOTE(review): url is not reset for each row, so a row without a
        # ".torrent" link reuses the previous row's value (or fails on the
        # first row) -- confirm whether that is intended.
        for a in columns[0].getiterator('a'):
            if '.torrent' in a.attrib.get('href', ''):
                url = a.attrib['href']

        # The cell text is the number (decimal comma allowed); its first
        # child element holds the unit.
        size_cell = columns[1]
        number = size_cell.text
        unit = size_cell.getchildren()[0].text
        number = float(number.replace(',', '.'))

        seed_text = columns[4].text
        leech_text = columns[5].text

        torrent = Torrent(idt, title)
        torrent.url = url
        query = urlsplit(url).query
        torrent.filename = parse_qs(query).get('title', [None])[0]
        torrent.size = get_bytes_size(number, unit)
        torrent.seeders = int(seed_text)
        torrent.leechers = int(leech_text)
        yield torrent
def iter_torrents(self):
    """Yield a ``Torrent`` for every torrent row of the results table.

    The table is ``table.torrent_table`` or, failing that,
    ``table#browse_torrent_table``; nothing is yielded when neither exists.
    ``group`` rows set the title prefix for the torrent rows that follow.
    Torrent identifiers have the form ``<id>.<torrentid>``.
    """
    root = self.document.getroot()
    tables = root.cssselect('table.torrent_table')
    if not tables:
        tables = root.cssselect('table#browse_torrent_table')
    if not tables:
        return

    current_group = None
    for tr in tables[0].findall('tr'):
        row_class = tr.attrib.get('class', '')

        if row_class == 'colhead':
            # ignore
            continue

        if row_class == 'group':
            tds = tr.findall('td')
            holder = tds[-6]
            if holder.getchildren()[0].tag == 'div':
                holder = holder.getchildren()[0]
            # Group title: the non-empty link texts joined with " - ".
            current_group = u''
            for a in holder.findall('a'):
                if not a.text:
                    continue
                if current_group:
                    current_group += ' - '
                current_group += a.text
            continue

        if not row_class.startswith(('group_torrent', 'torrent')):
            debug('unknown attrib: %s' % tr.attrib)
            continue

        tds = tr.findall('td')
        title = current_group
        if len(tds) == 7:
            # Under a group
            i = 0
        elif len(tds) in (8, 9):
            # An alone torrent: walk back to the last cell containing a link.
            i = len(tds) - 1
            while i >= 0 and tds[i].find('a') is None:
                i -= 1
        else:
            # Useless title
            continue

        cell = tds[i]
        if title:
            title += u' (%s)' % cell.find('a').text
        else:
            title = ' - '.join([a.text for a in cell.findall('a')])

        url = urlparse.urlparse(cell.find('a').attrib['href'])
        params = parse_qs(url.query)
        if 'torrentid' in params:
            torrent_id = '%s.%s' % (params['id'][0], params['torrentid'][0])
        else:
            # No torrentid in the first link: extract it from the link
            # inside the cell's <span> instead.
            url = cell.find('span').find('a').attrib['href']
            m = self.TORRENTID_REGEXP.match(url)
            if not m:
                continue
            torrent_id = '%s.%s' % (params['id'][0], m.group(1))

        # The size cell is three cells after the title cell, or two when
        # that one does not split into exactly two words.
        try:
            size, unit = tds[i + 3].text.split()
        except ValueError:
            size, unit = tds[i + 2].text.split()
        size = get_bytes_size(float(size.replace(',', '')), unit)
        seeders = int(tds[-2].text)
        leechers = int(tds[-1].text)

        torrent = Torrent(torrent_id, title)
        torrent.url = self.format_url(url)
        torrent.size = size
        torrent.seeders = seeders
        torrent.leechers = leechers
        yield torrent
def get_torrent(self, id):
    """Build a ``Torrent`` from the details page held in ``self.document``.

    The page lists torrents either as rows of ``table.torrent_table`` or as
    ``div`` widgets inside ``div.main_column``; the entry whose download link
    matches the requested torrent id provides url, size and peer counts.

    :param id: torrent identifier; when it contains a ``.``, the part after
        the first ``.`` is the torrent id looked up in the page
    :return: the populated ``Torrent``, or ``None`` (after a warning) when no
        entry set a URL on it
    """
    root = self.document.getroot()

    header = self.browser.parser.select(root, 'div.thin', 1)
    h2 = header.xpath('.//h2')
    if len(h2) > 0:
        title = u''.join([txt.strip() for txt in h2[0].itertext()])
    else:
        title = self.browser.parser.select(header, 'div.title_text', 1).text

    torrent = Torrent(id, title)
    if '.' in id:
        torrentid = id.split('.', 1)[1]
    else:
        torrentid = id

    candidates = self.browser.parser.select(root, 'table.torrent_table')
    if len(candidates) == 0:
        container = self.browser.parser.select(root, 'div.main_column', 1)
        is_table = False
    else:
        container = candidates[0]
        is_table = True

    for node in container.findall('tr' if is_table else 'div'):
        node_class = node.attrib.get('class', '')

        # Step 1: find the download link of this entry, if it is one.
        if is_table and 'group_torrent' in node_class:
            tds = node.findall('td')
            if len(tds) != 5:
                continue
            url = tds[0].find('span').find('a').attrib['href']
        elif not is_table and node_class.startswith('torrent_widget') \
                and node_class.endswith('pad'):
            url = node.cssselect('a[title=Download]')[0].attrib['href']
        else:
            continue

        # Step 2: keep only the entry of the requested torrent.
        m = self.TORRENTID_REGEXP.match(url)
        if not m:
            warning('ID not found')
            continue
        if m.group(1) != torrentid:
            continue

        # Step 3: read size and peer counts from the matching entry.
        torrent.url = self.format_url(url)
        if is_table:
            size, unit = tds[1].text.split()
            seeders_text = tds[3].text
            leechers_text = tds[4].text
        else:
            size, unit = node.cssselect('div.details_title strong')[-1].text.strip('()').split()
            seeders_text = node.cssselect('img[title=Seeders]')[0].tail
            leechers_text = node.cssselect('img[title=Leechers]')[0].tail
        torrent.size = get_bytes_size(float(size.replace(',', '')), unit)
        torrent.seeders = int(seeders_text)
        torrent.leechers = int(leechers_text)
        break

    if not torrent.url:
        warning('Torrent %s not found in list' % torrentid)
        return None

    # Description: every box that has both a heading and a body adds a
    # "<heading>\n\n<body>\n" section.
    main_column = self.parser.select(root, 'div.main_column', 1)
    for box in main_column.cssselect('div.box'):
        heading = None
        body = None

        heads = box.cssselect('div.head')
        if len(heads) > 0:
            head = heads[0]
            if head.find('strong') is not None:
                head = head.find('strong')
            if head.text is not None:
                heading = head.text.strip()

        bodies = box.cssselect('div.body,div.desc')
        if bodies:
            body = html2text(self.parser.tostring(bodies[-1])).strip()

        if heading and body:
            if torrent.description is NotLoaded:
                torrent.description = u''
            torrent.description += u'%s\n\n%s\n' % (heading, body)

    # File list: first cell of every row, except rows of class
    # "colhead_dark", in the tables found under the torrent's file containers.
    file_holders = root.cssselect(
        'div#files_%s,div#filelist_%s,tr#torrent_%s td' % (torrentid, torrentid, torrentid))
    if file_holders:
        torrent.files = []
        for holder in file_holders:
            file_table = holder.find('table')
            if file_table is None:
                continue
            for row in file_table:
                if row.attrib.get('class', None) != 'colhead_dark':
                    torrent.files.append(row.find('td').text)

    return torrent
def get_torrent(self, id):
    """Build a ``Torrent`` from the details page held in ``self.document``.

    :param id: identifier given to the returned ``Torrent``
    :return: the populated ``Torrent``; ``filename`` is only set when a
        download URL was found, and an empty description is reported as
        ``NotAvailable``
    """
    def peer_count(block):
        # The counter is the text of the block's second child; no text means 0.
        raw = block.getchildren()[1].text
        return int(raw) if raw is not None else 0

    seed = 0
    leech = 0
    description = NotAvailable
    url = NotAvailable
    magnet = NotAvailable

    for div in self.document.getiterator('div'):
        css_class = div.attrib.get('class', '')
        if div.attrib.get('id', '') == 'desc':
            try:
                description = unicode(div.text_content().strip())
            except UnicodeDecodeError:
                description = 'Description with invalid UTF-8.'
        elif css_class == 'seedBlock':
            seed = peer_count(div)
        elif css_class == 'leechBlock':
            leech = peer_count(div)

    title = unicode(
        self.parser.select(self.document.getroot(), 'h1.novertmarg span', 1).text)

    for a in self.parser.select(self.document.getroot(), 'div.downloadButtonGroup a'):
        href = a.attrib.get('href', '')
        if href.startswith('magnet'):
            magnet = unicode(href)
        elif href.startswith('//'):
            # Scheme-relative link: prefix it with https.
            url = u'https:%s' % href
        elif href.startswith('http'):
            url = unicode(href)

    size = 0
    u = ''
    for span in self.document.getiterator('span'):
        # Other spans can carry the same classes; the number of children is
        # used as a (not fully reliable) hint to pick the one with the size.
        if span.attrib.get('class', '') not in ['folder', 'folderopen']:
            continue
        children = span.getchildren()
        if len(children) <= 2:
            continue
        u = children[2].text
        size = float(children[1].tail.split(': ')[1].replace(',', '.'))

    files = []
    for td in self.document.getiterator('td'):
        if td.attrib.get('class', '') == 'torFileName':
            files.append(td.text)

    if description == '':
        description = NotAvailable

    torrent = Torrent(id, title)
    torrent.url = url
    if torrent.url:
        # The file name is the "title" query parameter of the download URL.
        torrent.filename = parse_qs(urlsplit(url).query).get('title', [None])[0]
    torrent.magnet = magnet
    torrent.size = get_bytes_size(size, u)
    torrent.seeders = int(seed)
    torrent.leechers = int(leech)
    torrent.description = description
    torrent.files = files
    return torrent
def iter_torrents(self):
    """Yield a ``Torrent`` for each torrent row of the results table.

    The table is looked up as ``table.torrent_table`` and, failing that, as
    ``table#browse_torrent_table``; nothing is yielded when neither exists.

    Row handling depends on the row's ``class`` attribute:

    * ``colhead`` rows are ignored;
    * ``group`` rows only set the name prefix used for following torrents;
    * rows whose class starts with ``group_torrent`` or ``torrent`` produce
      a result, unless no usable link or torrent id can be found in them;
    * any other row is reported through ``debug``.
    """
    root = self.document.getroot()
    tables = root.cssselect('table.torrent_table')
    if not tables:
        tables = root.cssselect('table#browse_torrent_table')
    if not tables:
        # No results table on this page: nothing to iterate over.
        return

    current_group = None
    for tr in tables[0].findall('tr'):
        row_class = tr.attrib.get('class', '')

        if row_class == 'colhead':
            # Column header row.
            continue

        if row_class == 'group':
            tds = tr.findall('td')
            cell = tds[-6]
            if cell.getchildren()[0].tag == 'div':
                cell = cell.getchildren()[0]
            # Group name is every non-empty link text joined with ' - '.
            current_group = u' - '.join(
                [a.text for a in cell.findall('a') if a.text])
            continue

        if not row_class.startswith(('group_torrent', 'torrent')):
            debug('unknown attrib: %s' % tr.attrib)
            continue

        tds = tr.findall('td')
        if len(tds) == 7:
            # Under a group
            i = 0
        elif len(tds) in (8, 9):
            # An alone torrent: take the last cell that contains a link.
            i = len(tds) - 1
            while i > 0 and tds[i].find('a') is None:
                i -= 1
        else:
            # Useless title
            continue

        link = tds[i].find('a')
        if link is None:
            # No cell of this row holds a link, so there is nothing to
            # build a torrent from.
            continue

        if current_group:
            title = current_group + u' (%s)' % link.text
        else:
            title = ' - '.join(
                [a.text for a in tds[i].findall('a') if a.text])

        # Keep the raw href: format_url() is given a string elsewhere, not
        # the parsed result.
        url = link.attrib['href']
        params = parse_qs(urlparse.urlparse(url).query)
        if 'torrentid' in params:
            torrent_id = '%s.%s' % (params['id'][0], params['torrentid'][0])
        else:
            url = tds[i].find('span').find('a').attrib['href']
            m = self.TORRENTID_REGEXP.match(url)
            if not m:
                continue
            torrent_id = '%s.%s' % (params['id'][0], m.group(1))

        # The size column position varies by one between layouts.
        try:
            size, unit = tds[i + 3].text.split()
        except ValueError:
            size, unit = tds[i + 2].text.split()

        torrent = Torrent(torrent_id, title)
        torrent.url = self.format_url(url)
        torrent.size = get_bytes_size(float(size.replace(',', '')), unit)
        torrent.seeders = int(tds[-2].text)
        torrent.leechers = int(tds[-1].text)
        yield torrent
def get_torrent(self, id):
    """Return the ``Torrent`` matching ``id`` on the current page.

    ``id`` may be of the form ``<group>.<torrentid>``; only the part after
    the first dot is compared with the ids found in the download links.
    Two layouts are handled: a ``table.torrent_table`` listing, or
    ``div`` widgets inside ``div.main_column``.  Returns ``None`` (after a
    warning) when no entry with the requested id is found.
    """
    select = self.browser.parser.select

    header = select(self.document.getroot(), 'div.thin', 1)
    headings = header.xpath('.//h2')
    if len(headings) > 0:
        title = u''.join([txt.strip() for txt in headings[0].itertext()])
    else:
        title = select(header, 'div.title_text', 1).text

    torrent = Torrent(id, title)
    torrentid = id.split('.', 1)[1] if '.' in id else id

    tables = select(self.document.getroot(), 'table.torrent_table')
    is_table = len(tables) != 0
    if is_table:
        container = tables[0]
    else:
        container = select(self.document.getroot(), 'div.main_column', 1)

    for row in container.findall('tr' if is_table else 'div'):
        row_class = row.attrib.get('class', '')

        # Locate the download link according to the layout in use.
        if is_table:
            if 'group_torrent' not in row_class:
                continue
            cells = row.findall('td')
            if len(cells) != 5:
                continue
            url = cells[0].find('span').find('a').attrib['href']
        else:
            if not (row_class.startswith('torrent_widget')
                    and row_class.endswith('pad')):
                continue
            url = row.cssselect('a[title=Download]')[0].attrib['href']

        m = self.TORRENTID_REGEXP.match(url)
        if not m:
            warning('ID not found')
            continue
        if m.group(1) != torrentid:
            continue

        torrent.url = self.format_url(url)
        if is_table:
            size, unit = cells[1].text.split()
            torrent.size = get_bytes_size(float(size.replace(',', '')), unit)
            torrent.seeders = int(cells[3].text)
            torrent.leechers = int(cells[4].text)
        else:
            size, unit = row.cssselect(
                'div.details_title strong')[-1].text.strip('()').split()
            torrent.size = get_bytes_size(float(size.replace(',', '')), unit)
            torrent.seeders = int(row.cssselect('img[title=Seeders]')[0].tail)
            torrent.leechers = int(row.cssselect('img[title=Leechers]')[0].tail)
        break

    if not torrent.url:
        warning('Torrent %s not found in list' % torrentid)
        return None

    # Every titled box of the main column is appended to the description.
    main_column = self.parser.select(self.document.getroot(),
                                     'div.main_column', 1)
    for box in main_column.cssselect('div.box'):
        title = None
        body = None

        heads = box.cssselect('div.head')
        if len(heads) > 0:
            head = heads[0]
            strong = head.find('strong')
            if strong is not None:
                head = strong
            if head.text is not None:
                title = head.text.strip()

        bodies = box.cssselect('div.body,div.desc')
        if bodies:
            body = html2text(self.parser.tostring(bodies[-1])).strip()

        if not (title and body):
            continue
        if torrent.description is NotLoaded:
            torrent.description = u''
        torrent.description += u'%s\n\n%s\n' % (title, body)

    selector = 'div#files_%s,div#filelist_%s,tr#torrent_%s td' % (
        torrentid, torrentid, torrentid)
    holders = self.document.getroot().cssselect(selector)
    if holders:
        torrent.files = []
        for holder in holders:
            listing = holder.find('table')
            if listing is None:
                continue
            for line in listing:
                # Skip the header line of the file table.
                if line.attrib.get('class', None) != 'colhead_dark':
                    torrent.files.append(line.find('td').text)

    return torrent
def iter_torrents(self):
    """Yield one ``Torrent`` per torrent row found in the results table.

    The results table is ``table.torrent_table`` or, when that is absent,
    ``table#browse_torrent_table``.  If neither is present the generator
    ends without yielding.

    Rows are dispatched on their ``class`` attribute: ``colhead`` rows are
    skipped, ``group`` rows update the group name that prefixes the titles
    of later torrents, rows starting with ``group_torrent`` or ``torrent``
    are turned into results, and anything else is logged with ``debug``.
    Torrent rows without a usable link or torrent id are skipped.
    """
    root = self.document.getroot()
    found = root.cssselect("table.torrent_table")
    if not found:
        found = root.cssselect("table#browse_torrent_table")
    if not found:
        # Without a results table there is nothing to yield.
        return

    current_group = None
    for tr in found[0].findall("tr"):
        css_class = tr.attrib.get("class", "")

        if css_class == "colhead":
            # Header row of the table.
            continue

        if css_class == "group":
            tds = tr.findall("td")
            holder = tds[-6]
            if holder.getchildren()[0].tag == "div":
                holder = holder.getchildren()[0]
            # Links without text are left out of the group name.
            names = [a.text for a in holder.findall("a") if a.text]
            current_group = u" - ".join(names)
            continue

        if not css_class.startswith(("group_torrent", "torrent")):
            debug("unknown attrib: %s" % tr.attrib)
            continue

        tds = tr.findall("td")
        cell_count = len(tds)
        if cell_count == 7:
            # Under a group
            i = 0
        elif cell_count in (8, 9):
            # An alone torrent: search backwards for a cell with a link.
            i = cell_count - 1
            while i > 0 and tds[i].find("a") is None:
                i -= 1
        else:
            # Useless title
            continue

        anchor = tds[i].find("a")
        if anchor is None:
            # The row has no link at all; it cannot describe a torrent.
            continue

        if current_group:
            title = current_group + u" (%s)" % anchor.text
        else:
            title = " - ".join([a.text for a in tds[i].findall("a") if a.text])

        # format_url() expects the href string, so the parsed form is only
        # used to read the query parameters.
        url = anchor.attrib["href"]
        params = parse_qs(urlparse.urlparse(url).query)
        if "torrentid" in params:
            torrent_id = "%s.%s" % (params["id"][0], params["torrentid"][0])
        else:
            url = tds[i].find("span").find("a").attrib["href"]
            m = self.TORRENTID_REGEXP.match(url)
            if not m:
                continue
            torrent_id = "%s.%s" % (params["id"][0], m.group(1))

        # Depending on the layout the size is 3 or 2 cells after the title.
        try:
            size, unit = tds[i + 3].text.split()
        except ValueError:
            size, unit = tds[i + 2].text.split()

        torrent = Torrent(torrent_id, title)
        torrent.url = self.format_url(url)
        torrent.size = get_bytes_size(float(size.replace(",", "")), unit)
        torrent.seeders = int(tds[-2].text)
        torrent.leechers = int(tds[-1].text)
        yield torrent
def get_torrent(self, id):
    """Look up torrent ``id`` on the current page and return it.

    The part of ``id`` after the first ``.`` (or the whole ``id`` when
    there is no dot) is matched against the ids extracted from download
    links, either in ``table.torrent_table`` rows or in the ``div``
    widgets of ``div.main_column``.  A warning is emitted and ``None``
    returned when nothing matches.
    """
    page_root = self.document.getroot()

    thin = self.browser.parser.select(page_root, "div.thin", 1)
    h2_nodes = thin.xpath(".//h2")
    if len(h2_nodes) > 0:
        title = u"".join([piece.strip() for piece in h2_nodes[0].itertext()])
    else:
        title = self.browser.parser.select(thin, "div.title_text", 1).text

    torrent = Torrent(id, title)
    if "." in id:
        torrentid = id.split(".", 1)[1]
    else:
        torrentid = id

    candidates = self.browser.parser.select(self.document.getroot(), "table.torrent_table")
    if len(candidates) == 0:
        is_table = False
        listing = self.browser.parser.select(self.document.getroot(), "div.main_column", 1)
        child_tag = "div"
    else:
        is_table = True
        listing = candidates[0]
        child_tag = "tr"

    for entry in listing.findall(child_tag):
        entry_class = entry.attrib.get("class", "")

        if is_table and "group_torrent" in entry_class:
            # Table layout: link, size, <unused>, seeders, leechers.
            tds = entry.findall("td")
            if len(tds) != 5:
                continue

            url = tds[0].find("span").find("a").attrib["href"]
            m = self.TORRENTID_REGEXP.match(url)
            if not m:
                warning("ID not found")
                continue
            if m.group(1) != torrentid:
                continue

            torrent.url = self.format_url(url)
            size, unit = tds[1].text.split()
            torrent.size = get_bytes_size(float(size.replace(",", "")), unit)
            torrent.seeders = int(tds[3].text)
            torrent.leechers = int(tds[4].text)
            break

        if is_table:
            continue

        if entry_class.startswith("torrent_widget") and entry_class.endswith("pad"):
            # Widget layout: values are spread over the widget's children.
            url = entry.cssselect("a[title=Download]")[0].attrib["href"]
            m = self.TORRENTID_REGEXP.match(url)
            if not m:
                warning("ID not found")
                continue
            if m.group(1) != torrentid:
                continue

            torrent.url = self.format_url(url)
            size_label = entry.cssselect("div.details_title strong")[-1].text
            size, unit = size_label.strip("()").split()
            torrent.size = get_bytes_size(float(size.replace(",", "")), unit)
            torrent.seeders = int(entry.cssselect("img[title=Seeders]")[0].tail)
            torrent.leechers = int(entry.cssselect("img[title=Leechers]")[0].tail)
            break

    if not torrent.url:
        warning("Torrent %s not found in list" % torrentid)
        return None

    # Concatenate "<title>\n\n<body>\n" for each box that has both parts.
    column = self.parser.select(self.document.getroot(), "div.main_column", 1)
    for box in column.cssselect("div.box"):
        title = None
        body = None

        head_nodes = box.cssselect("div.head")
        if len(head_nodes) > 0:
            title_node = head_nodes[0]
            emphasised = title_node.find("strong")
            if emphasised is not None:
                title_node = emphasised
            if title_node.text is not None:
                title = title_node.text.strip()

        body_nodes = box.cssselect("div.body,div.desc")
        if body_nodes:
            body = html2text(self.parser.tostring(body_nodes[-1])).strip()

        if title and body:
            if torrent.description is NotLoaded:
                torrent.description = u""
            torrent.description += u"%s\n\n%s\n" % (title, body)

    file_holders = self.document.getroot().cssselect(
        "div#files_%s,div#filelist_%s,tr#torrent_%s td" % (torrentid, torrentid, torrentid)
    )
    if file_holders:
        torrent.files = []
        for holder in file_holders:
            file_table = holder.find("table")
            if file_table is None:
                continue
            for file_row in file_table:
                # The row of class colhead_dark is the table header.
                if file_row.attrib.get("class", None) != "colhead_dark":
                    torrent.files.append(file_row.find("td").text)

    return torrent
def get_torrent(self, id):
    """Return the ``Torrent`` whose id is ``id`` from the current page.

    The title comes from the ``<h2>`` of ``div.thin``; url, size, seeders
    and leechers come from the ``group_torrent`` row of
    ``table.torrent_table`` whose download link carries the same id.  The
    description is assembled from the titled boxes of ``div.main_column``
    and the file list from ``div#files_<id>``.

    Returns ``None`` after emitting a warning when ``div.thin``, the
    torrent table, the matching row or ``div.main_column`` is missing.
    """
    root = self.document.getroot()

    thin = root.cssselect('div.thin')
    if not thin:
        warning('No div.thin found')
        return None

    h2 = thin[0].find('h2')
    title = h2.text or ''
    link = h2.find('a')
    if link is not None:
        # Either part may be absent in the markup; treat it as empty
        # instead of failing on str + None.
        title += (link.text or '') + (link.tail or '')

    torrent = Torrent(id, title)

    tables = root.cssselect('table.torrent_table')
    if not tables:
        warning('No table found')
        return None

    for tr in tables[0].findall('tr'):
        if not tr.attrib.get('class', '').startswith('group_torrent'):
            continue

        tds = tr.findall('td')
        if len(tds) != 5:
            continue

        url = tds[0].find('span').find('a').attrib['href']
        matched = self.TORRENTID_REGEXP.match(url)
        if not matched:
            warning('ID not found')
            continue
        if matched.group(1) != torrent.id:
            continue

        torrent.url = self.format_url(url)
        torrent.size = self.unit(*tds[1].text.split())
        torrent.seeders = int(tds[3].text)
        torrent.leechers = int(tds[4].text)
        break

    if not torrent.url:
        # torrent.id is compared with a regexp group above, i.e. it is a
        # string: it must be formatted with %s, %d would raise TypeError.
        warning('Torrent %s not found in list' % torrent.id)
        return None

    columns = root.cssselect('div.main_column')
    if not columns:
        warning('WTF')
        return None

    for box in columns[0].cssselect('div.box'):
        title = None
        body = None

        heads = box.cssselect('div.head')
        if heads:
            # Prefer the <strong> child when present, otherwise fall back
            # to the head's own text; a head without text gives no title.
            head = heads[0]
            strong = head.find('strong')
            if strong is not None:
                head = strong
            if head.text is not None:
                title = head.text.strip()

        bodies = box.cssselect('div.body')
        if bodies:
            body = html2text(self.browser.parser.tostring(bodies[0])).strip()

        if title and body:
            if torrent.description is NotLoaded:
                torrent.description = u''
            torrent.description += u'%s\n\n%s\n' % (title, body)

    file_divs = root.cssselect('div#files_%s' % torrent.id)
    if file_divs:
        torrent.files = []
        file_table = file_divs[0].find('table')
        if file_table is not None:
            for tr in file_table:
                # The colhead_dark row is the header of the file table.
                if tr.attrib.get('class', None) != 'colhead_dark':
                    torrent.files.append(tr.find('td').text)

    return torrent
def get_torrent(self, id):
    """Build a ``Torrent`` for ``id`` from the torrent detail page.

    The download URL is derived from ``id`` alone; title, seeders,
    leechers, size, description and file names are scraped from the
    current document.  Values that are not found stay ``NotAvailable``.
    """
    seed = NotAvailable
    leech = NotAvailable
    size = NotAvailable
    url = 'https://isohunt.com/download/%s/%s.torrent' % (id, id)

    meta = self.parser.select(self.document.getroot(),
                              'head > meta[name=title]', 1)
    title = unicode(meta.attrib.get('content', ''))

    # The green span holds the seeder count and names the tooltip that
    # holds the leecher count.
    tip_id = "none"
    for span in self.document.getiterator('span'):
        hover = span.attrib.get('onmouseover', '')
        if span.attrib.get('style', '') != 'color:green;' or 'ShowTip' not in hover:
            continue
        seed = int(span.tail.split(' ')[1])
        tip_id = hover.split("'")[1]

    for div in self.document.getiterator('div'):
        div_id = div.attrib.get('id', '')
        if div.attrib.get('class', '') == 'dirs ydsf' and tip_id in div_id:
            # Tooltip shown on mouse hover, matched through its id.
            tip_text = div.getchildren()[0].getchildren()[1].tail
            leech = int(tip_text.split(' ')[2])
        elif div_id == 'torrent_details':
            # The <b> holding the size has no distinguishing attribute, so
            # it is reached by position from the details container.
            size_text = div.getchildren()[6].getchildren()[0].getchildren()[0].text
            u = size_text[-2:]
            size = get_bytes_size(float(size_text[:-3]), u)

    # Files and description (uploader's comment).
    description = NotAvailable
    files = []
    count_p_found = 0
    for p in self.document.getiterator('p'):
        if p.attrib.get('style', '') != 'line-height:1.2em;margin-top:1.8em':
            continue
        count_p_found += 1
        if count_p_found == 1:
            comment = p.getchildren()[1].tail
            if comment is not None:
                description = unicode(comment)
        elif count_p_found == 2:
            label = p.getchildren()[0]
            if label.text == 'Directory:':
                files.append(label.tail.strip() + '/')
            else:
                files.append(label.tail.strip())

    for td in self.document.getiterator('td'):
        if td.attrib.get('class', '') != 'fileRows':
            continue
        # Path components are split across the cell text and the tails of
        # its children; rebuild the path with '/' separators.
        filename = td.text
        for separator in td.getchildren():
            filename = filename + '/'
            filename = filename + separator.tail
        files.append(filename)

    torrent = Torrent(id, title)
    torrent.url = url
    torrent.size = size
    torrent.seeders = seed
    torrent.leechers = leech
    torrent.description = description
    torrent.files = files
    return torrent