Beispiel #1
0
class ResultTest(unittest.TestCase):
    """Exercise Result._validate_title with include and exclude patterns.

    Each test runs against a fresh Result whose title is 'test title'.
    Patterns are given both as compiled regexes and as plain strings.
    """

    def setUp(self):
        """Create the Result under test and give it a known title."""
        fixture = Result()
        fixture.title = 'test title'
        self.result = fixture

    def _included(self, pattern):
        """Return the validation outcome when *pattern* is the include filter."""
        return self.result._validate_title(include=pattern)

    def _excluded(self, pattern):
        """Return the validation outcome when *pattern* is the exclude filter."""
        return self.result._validate_title(exclude=pattern)

    # Include filter: expected to validate only when the title matches
    def test_not_validate_result_include_regex(self):
        pattern = re.compile('\\bother\\b', re.I)
        self.assertFalse(self._included(pattern))

    def test_not_validate_result_include_string(self):
        pattern = '\\bother\\b'
        self.assertFalse(self._included(pattern))

    def test_validate_result_include_regex(self):
        pattern = re.compile('\\btest\\b', re.I)
        self.assertTrue(self._included(pattern))

    def test_validate_result_include_string(self):
        pattern = '\\btest\\b'
        self.assertTrue(self._included(pattern))

    # Exclude filter: expected to validate only when the title does not match
    def test_not_validate_result_exclude_regex(self):
        pattern = re.compile('\\btest\\b', re.I)
        self.assertFalse(self._excluded(pattern))

    def test_not_validate_result_exclude_string(self):
        pattern = '\\btest\\b'
        self.assertFalse(self._excluded(pattern))

    def test_validate_result_exclude_regex(self):
        pattern = re.compile('\\bother\\b', re.I)
        self.assertTrue(self._excluded(pattern))

    def test_validate_result_exclude_string(self):
        pattern = '\\bother\\b'
        self.assertTrue(self._excluded(pattern))
Beispiel #2
0
    def results(self, query, sort='date', pages_max=1, **kwargs):
        """Yield validated Result objects parsed from the XML search response.

        :param query: search query handed to self._send
        :param sort: key into SORT_DEF; the mapped value is sent with the query
        :param pages_max: maximum number of pages requested, starting at page 1
        :param kwargs: forwarded unchanged to Result.validate

        :raises SearchError: if the 'hasResults' count cannot be read from
            the response

        Iteration stops as soon as a page reports no results or no hits.
        """
        sort = SORT_DEF[sort]
        for page in range(1, pages_max + 1):
            data = self._send(query, page, sort)
            tree = etree.fromstring(data)
            try:
                results = int(tree.xpath('hasResults')[0].text)
            except (TypeError, ValueError, IndexError):
                # IndexError: element missing; TypeError: element present but
                # empty (text is None); ValueError: non-numeric text
                raise SearchError('failed to get results count from "%s"' % data)
            if not results:
                return
            hits = int(tree.xpath('results/hitsForThisPage')[0].text)
            if not hits:
                return
            for res in tree.xpath('results/hits'):
                # url, size and date are mandatory: skip the hit when any of
                # them is empty
                url = res.xpath('link')[0].text
                if not url:
                    logger.error('failed to get url from %s', data)
                    continue
                size = res.xpath('size')[0].text
                if not size:
                    logger.error('failed to get size from %s', data)
                    continue
                date = res.xpath('added')[0].text
                if not date:
                    logger.error('failed to get date from %s', data)
                    continue

                result = Result()
                result.auto = False
                result.type = 'filestube'
                result.title = clean(res.xpath('name')[0].text)
                result.url = url
                result.size = get_size(size)
                result.date = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
                if not result.validate(**kwargs):
                    continue
                yield result
Beispiel #3
0
    def results(self, query, category=None, sort='date', pages_max=1,
            **kwargs):
        """Yield validated torrent Result objects scraped from '#torrents li'.

        :param query: a URL (opened directly when is_url(query) is true) or a
            search string submitted as form field 'q'
        :param category: optional category name; its CAT_DEF value, when one
            exists, is submitted as form field 't'
        :param sort: passed to self._sort after a form search
        :param pages_max: maximum number of pages visited; pages after the
            first are loaded through self._next
        :param kwargs: forwarded unchanged to Result.validate

        :raises SearchError: 'no data' when self.url is unset, the page cannot
            be loaded or the selector returns None; 'overload' when the page
            has no items and RE_OVERLOAD matches its text
        """
        if not self.url:
            raise SearchError('no data')

        for page in range(1, pages_max + 1):
            if page > 1:
                if not self._next(page):
                    break
            else:
                if is_url(query):
                    if not self.browser.open(query):
                        raise SearchError('no data')
                else:
                    fields = {'q': query}
                    if category:
                        val = CAT_DEF.get(category.lower())
                        if val:
                            fields['t'] = [val]
                    if not self.browser.submit_form(self.url, fields=fields):
                        raise SearchError('no data')
                    self._sort(sort)

            lis = self.browser.cssselect('#torrents li')
            if not lis:
                # None and an empty list are told apart on purpose: only the
                # former is reported as missing data
                if lis is None:
                    raise SearchError('no data')
                elif RE_OVERLOAD.search(self.browser.tree.text_content()):
                    raise SearchError('overload')

            for el in lis:
                # truncated markup, used only in log messages
                log = html.tostring(el, pretty_print=True)[:1000]

                result = Result()
                result.type = 'torrent'
                result.safe = False

                links = el.cssselect('a')
                if not links:
                    logger.error('failed to get title from %s', log)
                    continue
                result.title = clean(html.tostring(links[0]))

                details = el.cssselect('.torInfo')
                if not details:
                    logger.error('failed to get details from %s', log)
                    continue
                res = RE_DETAILS.search(html.tostring(details[0]))
                if not res:
                    continue
                result.category = res.group(1).strip(' ').lower()

                date = res.group(3)
                result.date = self._get_date(date)
                if not result.date:
                    logger.error('failed to get date from "%s"', date)
                    continue

                seeds = details[0].cssselect('span.seeders')
                if seeds:
                    try:
                        result.seeds = int(seeds[0].text.replace(',', ''))
                    except (AttributeError, ValueError):
                        # best effort: an empty element (text is None) or a
                        # non-numeric value simply leaves seeds unset
                        pass

                tds = el.cssselect('tr td')
                if not tds:
                    logger.error('failed to get size from %s', log)
                    continue
                if not result.get_size(tds[0].text):
                    continue

                url_info = urljoin(self.url, links[0].get('href')).encode('utf-8')
                result.url = self._get_torrent_url(url_info)
                if not result.url:
                    logger.error('failed to get magnet url from %s', url_info)
                    continue
                if not result.get_hash():
                    continue

                if not result.validate(**kwargs):
                    continue

                yield result
Beispiel #4
0
 def setUp(self):
     """Create a Result titled 'test title' and store it on self.result."""
     fixture = Result()
     fixture.title = 'test title'
     self.result = fixture
Beispiel #5
0
    def results(self, query, category=None, sort='date', pages_max=1,
            **kwargs):
        """Yield validated torrent Result objects scraped from '#searchResult'.

        :param query: a URL (opened directly when is_url(query) is true) or a
            search string submitted as form field 'q'
        :param category: optional category name; its CAT_DEF value, when one
            exists, is used as the name of a form field set to ['on']
        :param sort: passed to self._sort after a form search
        :param pages_max: maximum number of pages visited; pages after the
            first are loaded through self._next
        :param kwargs: forwarded unchanged to Result.validate

        :raises SearchError: 'no data' when self.url is unset, the page cannot
            be loaded or the selector returns None; 'overload' when the page
            has no rows and RE_OVERLOAD matches its text
        """
        if not self.url:
            raise SearchError('no data')

        for page in range(1, pages_max + 1):
            if page > 1:
                if not self._next(page):
                    break
            else:
                if is_url(query):
                    if not self.browser.open(query):
                        raise SearchError('no data')
                else:
                    fields = {'q': query}
                    if category:
                        val = CAT_DEF.get(category.lower())
                        if val:
                            fields[val] = ['on']
                    if not self.browser.submit_form(self.url, fields=fields):
                        raise SearchError('no data')
                    self._sort(sort)

            trs = self.browser.cssselect('#searchResult tr:not([class="header"])')
            if not trs:
                # None and an empty list are told apart on purpose: only the
                # former is reported as missing data
                if trs is None:
                    raise SearchError('no data')
                elif RE_OVERLOAD.search(self.browser.tree.text_content()):
                    raise SearchError('overload')

            for tr in trs:
                # rows with fewer than 4 children are not result rows
                if len(tr) < 4:
                    continue

                # truncated markup, used only in log messages
                log = html.tostring(tr, pretty_print=True)[:1000]

                result = Result()
                result.type = 'torrent'
                result.safe = False

                # the category is optional: a failure is logged and the row
                # is still processed
                try:
                    result.category = tr[0].cssselect('a')[0].text.lower()
                except Exception:
                    logger.error('failed to get category from %s', log)

                res = tr.cssselect('div.detName a')
                if not res:
                    logger.error('failed to get title from %s', log)
                    continue
                result.title = res[0].text

                result.url = self._get_torrent_url(tr)
                if not result.url:
                    logger.error('failed to get magnet url from %s', log)
                    continue
                if not result.get_hash():
                    continue

                res = tr.cssselect('.detDesc')
                if not res:
                    logger.error('failed to get details from %s', log)
                    continue
                details = clean(html.tostring(res[0]))
                res_ = RE_DETAILS.search(details)
                if not res_:
                    logger.error('failed to parse details: %s', details)
                    continue
                date, size = res_.groups()
                if not result.get_size(size):
                    continue

                if not result.validate(**kwargs):
                    continue

                try:
                    result.date = self._get_date(date)
                except Exception as e:
                    # 'as' form: valid on Python 2.6+ and Python 3
                    logger.error('failed to get date from "%s": %s', date, str(e))
                    continue

                # seeds are best effort: any failure leaves them unset
                try:
                    result.seeds = int(tr[2].text)
                except Exception:
                    pass
                yield result
Beispiel #6
0
    def results(self, query, category=None, sort='date', pages_max=1,
            **kwargs):
        """Yield torrent Result objects scraped from the 'div.results' blocks.

        :param query: a URL (opened directly when is_url(query) is true) or a
            search string submitted as field 'q' of the form at index 0
        :param category: when set, results whose parsed category differs from
            it are skipped
        :param sort: passed to self._sort unless it is 'popularity'
        :param pages_max: maximum number of pages visited; pages after the
            first are loaded through self._next
        :param kwargs: forwarded unchanged to Result.validate

        :raises SearchError: 'no data' when self.url is unset, the page cannot
            be loaded or the 'div.results' selector returns None

        Paging stops when the first 'div.results h3' heading matches
        RE_APPROXIMATE_MATCH.
        """
        if not self.url:
            raise SearchError('no data')

        for page in range(1, pages_max + 1):
            if page > 1:
                if not self._next(page):
                    break
            else:
                if is_url(query):
                    if not self.browser.open(query):
                        raise SearchError('no data')
                else:
                    if not self.browser.submit_form(self.url,
                            index=0, fields={'q': query}):
                        raise SearchError('no data')
                    if sort != 'popularity':     # default sort is peers ('popularity')
                        self._sort(sort)

            divs = self.browser.cssselect('div.results')
            if divs is None:
                raise SearchError('no data')

            # Skip approximate matches
            res = self.browser.cssselect('div.results h3')
            if res and RE_APPROXIMATE_MATCH.search(html.tostring(res[0])):
                break

            for div in divs:
                # Skip sponsored links
                res = div.cssselect('h2')
                if res and RE_SPONSORED_LINK.search(html.tostring(res[0])):
                    continue

                for dl in div.cssselect('dl'):
                    links = dl.cssselect('a')
                    if not links:
                        continue

                    # truncated markup, used only in log messages
                    log = html.tostring(dl, pretty_print=True)[:1000]

                    result = Result()
                    result.type = 'torrent'
                    result.safe = False

                    title = self.get_link_text(html.tostring(links[0]))
                    if not title:
                        continue
                    result.title = clean(title)

                    # the category is optional: a failure is logged and the
                    # entry is still processed
                    try:
                        res = RE_CATEGORIES.search(html.tostring(links[0]))
                        result.category = self._get_category(res.group(1))
                    except Exception:
                        logger.error('failed to get category info from %s', log)

                    if category and category != result.category:
                        continue

                    if dl.cssselect('span.pe'):     # skip 'pending' results (missing date and size)
                        continue

                    # date and size are mandatory: skip the entry on failure
                    try:
                        date = dl.cssselect('.a')[0][0].get('title')
                        result.date = self._get_date(date)
                    except Exception:
                        logger.debug('failed to get date from %s', log)
                        continue
                    try:
                        size = dl.cssselect('.s')[0].text
                    except Exception:
                        logger.debug('failed to get size from %s', log)
                        continue
                    if not result.get_size(size):
                        continue

                    # validation runs before the torrent url lookup below, so
                    # rejected entries never reach self._get_torrent_url
                    if not result.validate(**kwargs):
                        continue

                    # seeds are best effort: a failure is logged and ignored
                    try:
                        seeds = dl.cssselect('.d')[0].text
                        result.seeds = int(seeds.replace(',', ''))
                    except Exception:
                        logger.debug('failed to get seeds from %s', log)

                    # Find torrent url
                    url_info = urljoin(self.url, links[0].get('href'))
                    result.url = self._get_torrent_url(query, url_info)
                    if not result.url:
                        continue
                    if not result.get_hash():
                        continue
                    yield result
Beispiel #7
0
    def results(self, query, sort='date', pages_max=1, **kwargs):
        """Yield validated Result objects scraped from the rows of 'table#r2'.

        :param query: search string submitted as form field 'q'
        :param sort: accepted but not used by this method
        :param pages_max: maximum number of pages visited; each following page
            is reached through the last link of the last table, and only when
            that link's text is '&gt;'
        :param kwargs: forwarded unchanged to Result.validate

        :raises SearchError: 'no data' when self.url is unset, the form
            submission fails or a following page cannot be opened

        A row is skipped when it contains a 'th', has no 'span.s' title, no
        checkbox reference, no link matching RE_COLLECTION, matches
        RE_PASSWORD, has no size, or when the two RE_PARTS groups differ.
        """
        if not self.url:
            raise SearchError('no data')

        # NOTE(review): the first submit_form call below receives url=None;
        # presumably the browser then works on its currently loaded page -
        # confirm against the browser implementation
        url = None

        for i in range(pages_max):
            if i == 0:
                if not self.browser.submit_form(url, fields={'q': query}):
                    raise SearchError('no data')
            else:
                tables = self.browser.cssselect('table')
                if not tables:
                    # no table on the current page: move on to the next
                    # iteration without loading a new page
                    continue
                links = tables[-1].cssselect('a')
                if not links:
                    break
                next_text = self.get_link_text(html.tostring(links[-1]))
                if next_text != '&gt;':
                    break
                url = urljoin(self.url, links[-1].get('href'))
                if not self.browser.open(url):
                    raise SearchError('no data')

            for tr in self.browser.cssselect('table#r2 tr', []):
                # rows containing a 'th' cell are skipped
                if tr.cssselect('th'):
                    continue

                # truncated markup, used only in log messages
                log = html.tostring(tr, pretty_print=True)[:1000]

                result = Result()
                result.type = 'binsearch'

                titles = tr.cssselect('span.s')
                if not titles:
                    continue
                title = titles[0].text
                # prefer the first RE_TITLE match over the raw text
                res = RE_TITLE.findall(title)
                if res:
                    title = res[0]
                result.title = clean(title)

                age = tr[-1].text
                # NOTE(review): unlike the other failures, a missing age is
                # only logged and the row is still processed with
                # self._get_date(age) - confirm this is intended
                if not age:
                    logger.error('failed to get age from %s', log)
                result.date = self._get_date(age)

                refs = tr.cssselect('input[type="checkbox"]')
                if not refs:
                    logger.error('failed to get references list from %s', log)
                    continue
                ref = refs[0].get('name')
                if not ref:
                    logger.error('failed to get reference from %s', log)
                    continue
                result.ref = ref

                info = tr.cssselect('span.d')
                if not info:
                    continue
                links = info[0].cssselect('a')
                if not links or not RE_COLLECTION.search(links[0].text):
                    continue
                result.url = urljoin(self.url, links[0].get('href'))

                # from here on 'info' is the cleaned text of the element
                info = clean(html.tostring(info[0]))
                if RE_PASSWORD.search(info):
                    continue

                res = RE_SIZE.search(info)
                if not res:
                    continue
                result.size = get_size(res.group(1))

                # keep the row only when both RE_PARTS groups are equal
                res = RE_PARTS.search(info)
                if not res or res.group(1) != res.group(2):
                    continue

                if not result.validate(**kwargs):
                    continue
                yield result
Beispiel #8
0
    def results(self, query, category=None, sort='date', pages_max=1,
            **kwargs):
        """Yield validated torrent Result objects scraped from '.table-torrents'.

        :param query: a URL (opened directly when is_url(query) is true) or a
            search string submitted as form field 'ihq'
        :param category: accepted but not used by this method
        :param sort: passed to self._sort after a form search
        :param pages_max: maximum number of pages visited; pages after the
            first are loaded through self._next
        :param kwargs: forwarded unchanged to Result.validate

        :raises SearchError: 'no data' when self.url is unset, the page cannot
            be loaded or the selector returns None; 'overload' when the page
            has no rows and RE_OVERLOAD matches its text
        """
        if not self.url:
            raise SearchError('no data')

        for page in range(1, pages_max + 1):
            if page > 1:
                if not self._next(page):
                    break
            else:
                if is_url(query):
                    if not self.browser.open(query):
                        raise SearchError('no data')
                else:
                    fields = {'ihq': query}
                    if not self.browser.submit_form(self.url,
                            fields=fields):
                        raise SearchError('no data')
                    self._sort(sort)

            trs = self.browser.cssselect('.table-torrents tr[data-key]')
            if not trs:
                # None and an empty list are told apart on purpose: only the
                # former is reported as missing data
                if trs is None:
                    raise SearchError('no data')
                elif RE_OVERLOAD.search(self.browser.tree.text_content()):
                    raise SearchError('overload')

            for tr in trs:
                # truncated markup, used only in log messages
                log = html.tostring(tr, pretty_print=True)[:1000]

                result = Result()
                result.type = 'torrent'
                result.safe = False

                # the category is optional: a failure is logged and the row
                # is still processed; a dedicated local keeps the 'category'
                # parameter from being overwritten
                category_ = tr.cssselect('.category-row span')
                if not category_:
                    category_name = None
                else:
                    try:
                        category_name = category_[0].get('title').lower()
                    except Exception:
                        category_name = None
                if not category_name:
                    logger.error('failed to get category from %s', log)
                else:
                    result.category = category_name

                links_ = tr.cssselect('.title-row a')
                if not links_:
                    logger.error('failed to get title link from %s', log)
                    continue
                try:
                    result.title = links_[0].cssselect('span')[0].text
                except Exception:
                    logger.error('failed to get title from %s', log)
                    continue

                url_info = urljoin(self.url, links_[0].get('href'))

                size_ = tr.cssselect('.size-row')
                if not size_:
                    logger.error('failed to get size from %s', log)
                    continue
                size = size_[0].text
                if not result.get_size(size):
                    logger.error('failed to get size from "%s"', size)
                    continue

                date_ = tr.cssselect('.date-row')
                if not date_:
                    # message fixed: it used to report a missing size
                    logger.error('failed to get date from %s', log)
                    continue
                date = date_[0].text
                try:
                    result.date = self._get_date(date)
                except Exception as e:
                    # 'as' form: valid on Python 2.6+ and Python 3
                    logger.error('failed to get date from "%s": %s', date, str(e))
                    continue

                # validation runs before the torrent url lookup below, so
                # rejected rows never reach self._get_torrent_url
                if not result.validate(**kwargs):
                    continue

                result.url = self._get_torrent_url(url_info)
                if not result.url:
                    logger.error('failed to get magnet url from %s', url_info)
                    continue
                if not result.get_hash():
                    continue

                # seeds are best effort: a failure is logged and ignored
                try:
                    result.seeds = int(tr[-2].text)
                except Exception:
                    logger.error('failed to get seeds from %s', log)

                yield result
Beispiel #9
0
    def results(self, query, category=None, pages_max=1, **kwargs):
        """Yield validated Result objects scraped from '#tor-tbl tbody tr'.

        :param query: search string sent as the 'nm' parameter of QUERY_URL
        :param category: accepted but not used by this method; every result
            gets category None
        :param pages_max: maximum number of pages visited; pages after the
            first are loaded through self._next
        :param kwargs: forwarded unchanged to Result.validate

        :raises SearchError: 'no data' when self.url is unset, the page cannot
            be loaded or the selector returns None; 'overload' when the page
            has no rows and RE_OVERLOAD matches its text
        """
        if not self.url:
            raise SearchError('no data')

        url = '%s?%s' % (QUERY_URL, urlencode({'nm': query}))
        for page in range(1, pages_max + 1):
            if page > 1:
                if not self._next(page):
                    break
            else:
                if not self.browser.open(url):
                    raise SearchError('no data')

            trs = self.browser.cssselect('#tor-tbl tbody tr')
            if not trs:
                # None and an empty list are told apart on purpose: only the
                # former is reported as missing data
                if trs is None:
                    raise SearchError('no data')
                elif RE_OVERLOAD.search(self.browser.tree.text_content()):
                    raise SearchError('overload')

            for el in trs:
                # single-cell rows are skipped
                if len(el) == 1:
                    continue
                # truncated markup, used only in log messages
                log = html.tostring(el, pretty_print=True)[:1000]

                result = Result()
                result.type = 'rutracker'
                result.safe = False
                result.category = None

                # cells are addressed by position: 3 = title, 5 = download
                # link and size, 6 = seeds, 9 = date
                links = el[3].cssselect('a')
                if not links:
                    logger.error('failed to get title from %s', log)
                    continue
                result.title = clean(html.tostring(links[0]))

                links = el[5].cssselect('a')
                if not links:
                    logger.debug('failed to get torrent url from %s', html.tostring(el[5]))
                    continue
                result.url = links[0].get('href')

                size = clean(links[0].text or '').replace('_', ' ').strip()
                if not result.get_size(size):
                    continue

                seeds = el[6].cssselect('.seedmed')
                if seeds:
                    try:
                        result.seeds = int(seeds[0].text)
                    except (TypeError, ValueError):
                        # best effort: an empty element (text is None) or a
                        # non-numeric value simply leaves seeds unset
                        pass

                els = el[9].cssselect('u')
                if not els:
                    logger.error('failed to get date from %s', log)
                    continue
                try:
                    result.date = datetime.utcfromtimestamp(int(els[0].text))
                except (TypeError, ValueError):
                    # TypeError covers an empty element (text is None)
                    logger.error('failed to get date from %s', els[0].text)
                    continue

                if not result.validate(**kwargs):
                    continue

                yield result