def parse(self, data, mode):
    """
    Parse search results for items.

    :param data: The raw response from a search
    :param mode: The current mode used to search, e.g. RSS

    :return: A list of items found
    """
    items = []

    with BS4Parser(data, 'html5lib') as soup:
        torrent_table = soup.find('table', class_='listing')
        torrent_rows = torrent_table('tr') if torrent_table else []

        # Continue only if at least one release is found
        if len(torrent_rows) < 2:
            log.debug('Data returned from provider does not contain any torrents')
            return items

        # Each release spans two consecutive rows (description row + stats row);
        # skip the header row when the first row looks like column headers.
        a = 1 if len(torrent_rows[0]('td')) < 2 else 0

        # Skip column headers
        for top, bot in zip(torrent_rows[a::2], torrent_rows[a + 1::2]):
            try:
                desc_top = top.find('td', class_='desc-top')
                title = desc_top.get_text(strip=True) if desc_top else None
                download_url = desc_top.find('a')['href'] if desc_top else None
                if not all([title, download_url]):
                    continue

                # Stats cell looks like 'S: 10 L: 2 C: 0 ID: 12345' once spaces removed.
                stats = bot.find('td', class_='stats').get_text(strip=True)
                sl = re.match(
                    r'S:(?P<seeders>\d+)L:(?P<leechers>\d+)C:(?:\d+)ID:(?:\d+)',
                    stats.replace(' ', ''))
                seeders = try_int(sl.group('seeders')) if sl else 0
                leechers = try_int(sl.group('leechers')) if sl else 0

                # Filter unseeded torrent.
                # Was `min(self.minseed, 1)`, which made the setting ineffective.
                if seeders < self.minseed:
                    if mode != 'RSS':
                        log.debug("Discarding torrent because it doesn't meet the"
                                  " minimum seeders: {0}. Seeders: {1}",
                                  title, seeders)
                    continue

                desc_bottom = bot.find('td', class_='desc-bot').get_text(strip=True)
                # str.strip('Size: ') strips a *character set*, not a prefix, and
                # could eat legitimate leading/trailing size characters.
                # Remove the literal 'Size:' prefix instead.
                raw_size = desc_bottom.split('|')[1].strip()
                if raw_size.startswith('Size:'):
                    raw_size = raw_size[len('Size:'):].strip()
                size = convert_size(raw_size) or -1

                item = {
                    'title': title,
                    'link': download_url,
                    'size': size,
                    'seeders': seeders,
                    'leechers': leechers,
                    'pubdate': None,
                }
                if mode != 'RSS':
                    log.debug('Found result: {0} with {1} seeders and {2} leechers',
                              title, seeders, leechers)

                items.append(item)
            except (AttributeError, TypeError, KeyError, ValueError, IndexError):
                log.exception('Failed parsing provider.')

    return items
def parse(self, data, mode):
    """
    Parse search results for items.

    :param data: The raw response from a search
    :param mode: The current mode used to search, e.g. RSS

    :return: A list of items found
    """
    def process_column_header(td):
        # Prefer the header image's title, then the cell text, then the anchor title.
        result = ''
        if td.a and td.a.img:
            result = td.a.img.get('title', td.a.get_text(strip=True))
        if not result:
            result = td.get_text(strip=True)
        if not result and td.a and td.a.get('title'):
            result = td.a['title']
        return result

    items = []

    with BS4Parser(data, 'html5lib') as html:
        torrent_table = html.find('table', class_='torrent_table')
        torrent_rows = torrent_table('tr') if torrent_table else []

        # Continue only if at least one release is found
        if len(torrent_rows) < 2:
            log.debug('Data returned from provider does not contain any torrents')
            return items

        # Need to only search one level deep for 'td' tags, as one of the td's also has a td.
        labels = [
            process_column_header(label)
            for label in torrent_rows[0].find_all('td', recursive=False)
        ]

        # Skip column headers
        for row in torrent_rows[1:]:
            cells = row.find_all('td', recursive=False)
            if len(cells) < len(labels):
                continue

            try:
                # Skip if torrent has been nuked due to poor quality
                if row.find('img', alt='Nuked'):
                    continue

                title = cells[labels.index('Name')].find(
                    'a', class_='overlay_torrent').get_text(strip=True)
                download_url = urljoin(
                    self.url, cells[labels.index('Name')].find('a')['href'])
                if not all([title, download_url]):
                    continue

                seeders = int(cells[labels.index('Seeders')].get_text(
                    strip=True).replace(',', ''))
                leechers = int(cells[labels.index('Leechers')].get_text(
                    strip=True).replace(',', ''))

                # Filter unseeded torrent
                if seeders < self.minseed:
                    if mode != 'RSS':
                        log.debug("Discarding torrent because it doesn't meet the"
                                  ' minimum seeders: {0}. Seeders: {1}',
                                  title, seeders)
                    continue

                # Use binary units throughout; the original list mixed decimal
                # 'TB'/'PB' into a binary 'KIB'/'MIB'/'GIB' sequence, mis-sizing
                # very large torrents.
                units = ['B', 'KIB', 'MIB', 'GIB', 'TIB', 'PIB']
                torrent_size = cells[labels.index('Size')].get_text(strip=True)
                size = convert_size(torrent_size, units=units) or -1

                # NOTE(review): hard-coded column index — presumably the added/time
                # column; verify against the site's current table layout.
                pubdate_raw = cells[3].find('span')['title']
                pubdate = self.parse_pubdate(pubdate_raw)

                item = {
                    'title': title,
                    'link': download_url,
                    'size': size,
                    'seeders': seeders,
                    'leechers': leechers,
                    'pubdate': pubdate,
                }
                if mode != 'RSS':
                    log.debug('Found result: {0} with {1} seeders and {2} leechers',
                              title, seeders, leechers)

                items.append(item)
            except (AttributeError, TypeError, KeyError, ValueError, IndexError):
                log.exception('Failed parsing provider.')

    return items
def parse(self, data, mode):
    """
    Parse search results for items.

    :param data: The raw response from a search
    :param mode: The current mode used to search, e.g. RSS

    :return: A list of items found
    """
    items = []

    with BS4Parser(data, 'html5lib') as html:
        # Continue only if at least one release is found
        if html.find('h2', text='No .torrents fit this filter criteria'):
            log.debug('Data returned from provider does not contain any torrents')
            return items

        table = html.find('table', attrs={'style': 'border: none; width: 100%;'})
        result_rows = table('tr', class_='browse') if table else []

        for result in result_rows:
            columns = result('td')
            try:
                title = columns[1].find('a').get('title')
                torrent_url = columns[2].find('a').get('href')
                download_url = urljoin(self.url, torrent_url)
                if not all([title, torrent_url]):
                    continue

                # Default seeders to 1 when the cell is not a number.
                seeders = try_int(columns[9].get_text(), 1)
                leechers = try_int(columns[10].get_text())

                # Filter unseeded torrent
                if seeders < self.minseed:
                    if mode != 'RSS':
                        log.debug("Discarding torrent because it doesn't meet the"
                                  ' minimum seeders: {0}. Seeders: {1}',
                                  title, seeders)
                    continue

                # Normalize the raw size text before conversion.
                normalized_size = self._norm_size(columns[7].get_text(strip=True))
                size = convert_size(normalized_size) or -1

                pubdate = self.parse_pubdate(columns[5].get_text(), human_time=True)

                item = {
                    'title': title,
                    'link': download_url,
                    'size': size,
                    'seeders': seeders,
                    'leechers': leechers,
                    'pubdate': pubdate,
                }
                if mode != 'RSS':
                    log.debug('Found result: {0} with {1} seeders and {2} leechers',
                              title, seeders, leechers)

                items.append(item)
            except (AttributeError, TypeError, KeyError, ValueError, IndexError):
                log.exception('Failed parsing provider.')

    return items
def parse(self, data, mode):
    """
    Parse search results for items.

    :param data: The raw response from a search
    :param mode: The current mode used to search, e.g. RSS

    :return: A list of items found
    """
    items = []

    with BS4Parser(data, 'html5lib') as html:
        torrent_table = html.find('table', {'id': 'torrent_table'})

        # Continue only if at least one release is found
        if not torrent_table:
            log.debug('Data returned from provider does not contain any {0}torrents',
                      'ranked ' if self.ranked else '')
            return items

        torrent_body = torrent_table.find('tbody')
        torrent_rows = torrent_body.contents
        # Drop the spacer rows interleaved with the torrent rows.
        del torrent_rows[1::2]

        for row in torrent_rows[1:]:
            try:
                torrent = row('td')
                if len(torrent) <= 1:
                    break

                all_as = (torrent[1])('a')

                notinternal = row.find('img', src='/static//common/user_upload.png')
                if self.ranked and notinternal:
                    log.debug('Found a user uploaded release, Ignoring it..')
                    continue

                freeleech = row.find('img', src='/static//common/browse/freeleech.png')
                if self.freeleech and not freeleech:
                    continue

                title = all_as[2].string
                download_url = urljoin(self.url, all_as[0].attrs['href'])
                if not all([title, download_url]):
                    continue

                seeders = try_int((row('td')[6]).text.replace(',', ''))
                leechers = try_int((row('td')[7]).text.replace(',', ''))

                # Filter unseeded torrent.
                # Was `min(self.minseed, 1)`, which made the setting ineffective.
                if seeders < self.minseed:
                    if mode != 'RSS':
                        log.debug("Discarding torrent because it doesn't meet the"
                                  " minimum seeders: {0}. Seeders: {1}",
                                  title, seeders)
                    continue

                # Default size to -1 so it is always bound; previously an empty
                # size cell left `size` undefined and raised an uncaught NameError
                # (NameError is not in the except tuple below).
                size = -1
                torrent_size = row.find('td', class_='nobr').find_next_sibling('td').string
                if torrent_size:
                    size = convert_size(torrent_size) or -1

                pubdate_raw = row.find('td', class_='nobr').find('span')['title']
                pubdate = self.parse_pubdate(pubdate_raw)

                item = {
                    'title': title,
                    'link': download_url,
                    'size': size,
                    'seeders': seeders,
                    'leechers': leechers,
                    'pubdate': pubdate,
                }
                if mode != 'RSS':
                    log.debug('Found result: {0} with {1} seeders and {2} leechers',
                              title, seeders, leechers)

                items.append(item)
            except (AttributeError, TypeError, KeyError, ValueError, IndexError):
                # Consistent with sibling parsers: log with stack trace.
                log.exception('Failed parsing provider.')

    return items
def parse(self, data, mode):
    """
    Parse search results for items.

    :param data: The raw response from a search
    :param mode: The current mode used to search, e.g. RSS

    :return: A list of items found
    """
    items = []

    with BS4Parser(data, 'html.parser') as html:
        torrent_table = html.find('div', class_='browse')
        torrent_rows = torrent_table(
            'div', class_=re.compile('^line')) if torrent_table else []

        # Continue only if at least one release is found
        if len(torrent_rows) < 1:
            log.debug('Data returned from provider does not contain any torrents')
            return items

        for row in torrent_rows:
            try:
                # Title cell holds 'hebrew/english' (or 'hebrew\english');
                # keep the part after the separator, skip rows without one.
                heb_eng_title = row.find('div', class_='bTitle').find(
                    href=re.compile(r'details\.php')).find('b').get_text()
                if '/' in heb_eng_title:
                    title = heb_eng_title.split('/')[1].strip()
                elif '\\' in heb_eng_title:
                    title = heb_eng_title.split('\\')[1].strip()
                else:
                    continue

                download_id = row.find('div', class_='bTitle').find(
                    href=re.compile(r'download\.php'))['href']
                if not all([title, download_id]):
                    continue

                download_url = urljoin(self.url, download_id)

                seeders = try_int(row.find('div', class_='bUping').get_text(strip=True))
                leechers = try_int(row.find('div', class_='bDowning').get_text(strip=True))

                # Filter unseeded torrent.
                # Was `min(self.minseed, 1)`, which made the setting ineffective.
                if seeders < self.minseed:
                    if mode != 'RSS':
                        log.debug("Discarding torrent because it doesn't meet the"
                                  ' minimum seeders: {0}. Seeders: {1}',
                                  title, seeders)
                    continue

                # Skip the leading label text before the numeric size.
                torrent_size = row.find('div', class_='bSize').get_text(strip=True)
                size = convert_size(torrent_size[5:], sep='') or -1

                pubdate_raw = row.find('div', class_=re.compile(
                    'bHow')).find_all('span')[1].next_sibling.strip()
                pubdate = self.parse_pubdate(pubdate_raw)

                item = {
                    'title': title,
                    'link': download_url,
                    'size': size,
                    'seeders': seeders,
                    'leechers': leechers,
                    'pubdate': pubdate,
                }
                if mode != 'RSS':
                    log.debug('Found result: {0} with {1} seeders and {2} leechers',
                              title, seeders, leechers)

                items.append(item)
            except (AttributeError, TypeError, KeyError, ValueError, IndexError):
                log.exception('Failed parsing provider.')

    return items
def parse(self, data, mode):
    """
    Parse search results for items.

    :param data: The raw response from a search
    :param mode: The current mode used to search, e.g. RSS

    :return: A list of items found
    """
    items = []

    with BS4Parser(data, 'html5lib') as html:
        torrent_rows = html.find_all(class_=['release-info', 'release-links'])

        # Continue only if at least one release is found
        if not torrent_rows:
            log.debug('Data returned from provider does not contain any torrents')
            return items

        # Rows alternate between a 'release-info' row (carries the date) and a
        # 'release-links' row (carries title/links). Initialize pubdate up front
        # so a leading 'release-links' row cannot raise an uncaught NameError.
        pubdate = None

        for row in torrent_rows:
            try:
                if row['class'] == ['release-info']:
                    pubdate = None

                    # pubdate is only supported for non-daily searches
                    if mode != 'RSS':
                        # keep the date and strip the rest
                        pubdate_raw = row.find('td', class_='rls-label').get_text()[1:9]
                        pubdate = self.parse_pubdate(
                            pubdate_raw, timezone='America/Los_Angeles')
                    continue

                title = row.find('td', class_='dl-label').get_text()
                magnet = row.find('td', class_='dl-type hs-magnet-link')
                download_url = magnet or row.find('td', class_='dl-type hs-torrent-link')
                if not all([title, download_url]):
                    continue

                download_url = download_url.span.a.get('href')

                # Add HorribleSubs group to the title
                title = '{group} {title}'.format(group='[HorribleSubs]', title=title)

                # HorribleSubs doesn't provide this information
                seeders = 1
                leechers = 0
                size = -1

                item = {
                    'title': title,
                    'link': download_url,
                    'size': size,
                    'seeders': seeders,
                    'leechers': leechers,
                    'pubdate': pubdate,
                }
                if mode != 'RSS':
                    log.debug('Found result: {0} with {1} seeders and {2} leechers',
                              title, seeders, leechers)

                items.append(item)
            except (AttributeError, TypeError, KeyError, ValueError, IndexError):
                log.exception('Failed parsing provider.')

    return items
def parse(self, data, mode):
    """
    Parse search results for items.

    :param data: The raw response from a search
    :param mode: The current mode used to search, e.g. RSS

    :return: A list of items found
    """
    items = []

    with BS4Parser(data, 'html5lib') as html:
        torrent_rows = html.find_all('tr')

        # Continue only if at least one release is found
        if len(torrent_rows) < 2:
            log.debug('Data returned from provider does not contain any torrents')
            return items

        # Scenetime apparently uses different number of cells in #torrenttable based
        # on who you are. This works around that by extracting labels from the first
        # <tr> and using their index to find the correct download/seeders/leechers td.
        labels = [label.get_text(strip=True) or label.img['title']
                  for label in torrent_rows[0]('td')]

        # Skip column headers
        for row in torrent_rows[1:]:
            cells = row('td')
            if len(cells) < len(labels):
                continue

            try:
                link = cells[labels.index('Name')].find('a')
                torrent_id = link['href'].replace('details.php?id=', '').split('&')[0]
                title = link.get_text(strip=True)
                download_url = self.urls['download'].format(
                    torrent_id, '{0}.torrent'.format(title.replace(' ', '.')))
                if not all([title, download_url]):
                    continue

                seeders = try_int(cells[labels.index('Seeders')].get_text(strip=True))
                leechers = try_int(cells[labels.index('Leechers')].get_text(strip=True))

                # Filter unseeded torrent.
                # Was `min(self.minseed, 1)`, which made the setting ineffective.
                if seeders < self.minseed:
                    if mode != 'RSS':
                        log.debug("Discarding torrent because it doesn't meet the"
                                  " minimum seeders: {0}. Seeders: {1}",
                                  title, seeders)
                    continue

                torrent_size = cells[labels.index('Size')].get_text()
                # Insert a space between the number and the unit so convert_size
                # can parse values like '1.2GB'.
                torrent_size = re.sub(r'(\d+\.?\d*)', r'\1 ', torrent_size)
                size = convert_size(torrent_size) or -1

                item = {
                    'title': title,
                    'link': download_url,
                    'size': size,
                    'seeders': seeders,
                    'leechers': leechers,
                    'pubdate': None,
                }
                if mode != 'RSS':
                    log.debug('Found result: {0} with {1} seeders and {2} leechers',
                              title, seeders, leechers)

                items.append(item)
            except (AttributeError, TypeError, KeyError, ValueError, IndexError):
                # Consistent with sibling parsers: log with stack trace.
                log.exception('Failed parsing provider.')

    return items
def parse(self, data, mode, **kwargs):
    """
    Parse search results for items.

    :param data: The raw response from a search
    :param mode: The current mode used to search, e.g. RSS.

    :return: A list of items found
    """
    items = []
    keywords = kwargs.pop('keywords', None)

    with BS4Parser(data, 'html5lib') as html:
        torrent_table = html.find(id='sortabletable')
        torrent_rows = torrent_table('tr') if torrent_table else []

        # Continue only if at least one release is found
        if len(torrent_rows) < 2:
            log.debug('Data returned from provider does not contain any torrents')
            return items

        labels = [
            label.img['title'] if label.img else label.get_text(strip=True)
            for label in torrent_rows[0]('td')
        ]

        for torrent in torrent_rows[1:]:
            try:
                # Freeleech releases are marked with a specific alt text.
                if self.freeleech and not torrent.find(
                        'img', alt=re.compile("TORRENT GRATUIT : Seulement "
                                              "l'upload sera compter.")):
                    continue

                title = torrent.find(class_='tooltip-content').div.get_text(strip=True)
                download_url = torrent.find(title='Télécharger le torrent!').parent['href']
                if not all([title, download_url]):
                    continue

                # Chop off tracker/channel prefix or we cannot parse the result!
                if mode != 'RSS' and keywords:
                    show_name_first_word = re.search(r'^[^ .]+', keywords).group()
                    if not title.startswith(show_name_first_word):
                        title = re.sub(
                            r'.*(' + show_name_first_word + '.*)', r'\1', title)

                seeders = try_int(torrent.find(title='Seeders').get_text(strip=True))
                leechers = try_int(torrent.find(title='Leechers').get_text(strip=True))

                # Filter unseeded torrent.
                # Was `min(self.minseed, 1)`, which made the setting ineffective.
                if seeders < self.minseed:
                    if mode != 'RSS':
                        log.debug("Discarding torrent because it doesn't meet the"
                                  " minimum seeders: {0}. Seeders: {1}",
                                  title, seeders)
                    continue

                torrent_size = torrent('td')[labels.index('Taille')].get_text(strip=True)
                size = convert_size(torrent_size) or -1

                pubdate_raw = torrent('td')[labels.index('Nom')].find_all(
                    'div')[-1].get_text(strip=True)
                pubdate = self.parse_pubdate(pubdate_raw, dayfirst=True)

                item = {
                    'title': title,
                    'link': download_url,
                    'size': size,
                    'seeders': seeders,
                    'leechers': leechers,
                    'pubdate': pubdate,
                }
                if mode != 'RSS':
                    log.debug('Found result: {0} with {1} seeders and {2} leechers',
                              title, seeders, leechers)

                items.append(item)
            except (AttributeError, TypeError, KeyError, ValueError, IndexError):
                # Consistent with sibling parsers: log with stack trace.
                log.exception('Failed parsing provider.')

    return items
def parse(self, data, mode):
    """
    Parse search results for items.

    :param data: The raw response from a search
    :param mode: The current mode used to search, e.g. RSS

    :return: A list of items found
    """
    items = []

    # Use html.parser, since html5parser has issues with this site.
    with BS4Parser(data, 'html.parser') as html:
        # Get the last table with a width of 800px.
        tables = html('table', width='800')
        torrent_table = tables[-1] if tables else []
        torrent_rows = torrent_table('tr') if torrent_table else []

        # Continue only if at least one release is found
        if len(torrent_rows) < 2:
            log.debug('Data returned from provider does not contain any torrents')
            return items

        # Skip column headers
        for row in torrent_rows[1:]:
            cells = row('td')
            if len(cells) < 3:
                # We must have cells[2] because it contains the title
                continue

            # Freeleech rows are highlighted with a bgcolor attribute.
            if self.freeleech and not row.get('bgcolor'):
                continue

            try:
                title = cells[2].find('a')['title'] if cells[2] else None
                download_url = urljoin(
                    self.url, cells[0].find('a')['href']) if cells[0] else None
                if not all([title, download_url]):
                    continue

                seeders = try_int(cells[8].get_text(strip=True)) if len(cells) > 8 else 1
                leechers = try_int(cells[9].get_text(strip=True)) if len(cells) > 9 else 0

                # Filter unseeded torrent.
                # Was `min(self.minseed, 1)`, which made the setting ineffective.
                if seeders < self.minseed:
                    if mode != 'RSS':
                        log.debug("Discarding torrent because it doesn't meet the"
                                  " minimum seeders: {0}. Seeders: {1}",
                                  title, seeders)
                    continue

                torrent_size = cells[6].get_text(' ') if len(cells) > 6 else None
                size = convert_size(torrent_size) or -1

                pubdate_raw = cells[5].get_text(' ')
                pubdate = self.parse_pubdate(pubdate_raw)

                item = {
                    'title': title,
                    'link': download_url,
                    'size': size,
                    'seeders': seeders,
                    'leechers': leechers,
                    'pubdate': pubdate,
                }
                if mode != 'RSS':
                    log.debug('Found result: {0} with {1} seeders and {2} leechers',
                              title, seeders, leechers)

                items.append(item)
            except (AttributeError, TypeError, KeyError, ValueError, IndexError):
                # Consistent with sibling parsers: log with stack trace.
                log.exception('Failed parsing provider.')

    return items
def parse(self, data, mode):
    """
    Parse search results for items.

    :param data: The raw response from a search
    :param mode: The current mode used to search, e.g. RSS

    :return: A list of items found
    """
    # Units
    units = ['B', 'KIB', 'MIB', 'GIB', 'TIB', 'PIB']

    items = []

    with BS4Parser(data, 'html5lib') as html:
        torrent_table = html.find('table', class_='mainblockcontenttt')
        torrent_rows = torrent_table('tr') if torrent_table else []

        # Guard the row count before probing torrent_rows[2]; previously a page
        # with fewer than three rows raised an uncaught IndexError here.
        # The third row carries a 'lista' cell when there are no results.
        if len(torrent_rows) < 3 or torrent_rows[2].find('td', class_='lista'):
            log.debug('Data returned from provider does not contain any torrents')
            return items

        # Cat., Active, Filename, Dl, Wl, Added, Size, Uploader, S, L, C
        labels = [label.a.get_text(strip=True) if label.a else label.get_text(strip=True)
                  for label in torrent_rows[0]('td')]

        # Skip column headers
        for row in torrent_rows[1:]:
            try:
                cells = row.findChildren('td')[:len(labels)]
                if len(cells) < len(labels):
                    continue

                title = cells[labels.index('Filename')].a
                title = title.get_text(strip=True) if title else None
                link = cells[labels.index('Dl')].a
                link = link.get('href') if link else None
                download_url = urljoin(self.url, link) if link else None
                if not all([title, download_url]):
                    continue

                seeders = try_int(cells[labels.index('S')].get_text(strip=True))
                leechers = try_int(cells[labels.index('L')].get_text(strip=True))

                # Filter unseeded torrent
                if seeders < self.minseed:
                    if mode != 'RSS':
                        log.debug("Discarding torrent because it doesn't meet the"
                                  ' minimum seeders: {0}. Seeders: {1}',
                                  title, seeders)
                    continue

                torrent_size = cells[labels.index('Size')].get_text()
                size = convert_size(torrent_size, units=units) or -1

                pubdate_raw = cells[labels.index('Added')].get_text()
                pubdate = self.parse_pubdate(pubdate_raw)

                item = {
                    'title': title,
                    'link': download_url,
                    'size': size,
                    'seeders': seeders,
                    'leechers': leechers,
                    'pubdate': pubdate,
                }
                if mode != 'RSS':
                    log.debug('Found result: {0} with {1} seeders and {2} leechers',
                              title, seeders, leechers)

                items.append(item)
            except (AttributeError, TypeError, KeyError, ValueError, IndexError):
                log.exception('Failed parsing provider.')

    return items
def parse(self, data, mode):
    """
    Parse search results for items.

    :param data: The raw response from a search
    :param mode: The current mode used to search, e.g. RSS

    :return: A list of items found
    """
    items = []

    with BS4Parser(data, 'html5lib') as html:
        result_list = html.find('ul', class_='search-ret-list')

        # Continue only if at least one release is found
        if not result_list:
            log.debug('Data returned from provider does not contain any torrents')
            return items

        for entry in result_list.find_all('li', class_='search-ret-item'):
            try:
                title = entry.find('h2').find('a').get('title')
                download_url = entry.find('div').find('a').get('href') + self._custom_trackers
                if not all([title, download_url]):
                    continue

                detail_spans = entry.find('div').find_all('span')

                # The provider does not expose peer counts.
                seeders = leechers = 0

                size = convert_size(detail_spans[0].get_text(), default=-1)
                pubdate = self.parse_pubdate(detail_spans[2].get_text())

                item = {
                    'title': title,
                    'link': download_url,
                    'size': size,
                    'seeders': seeders,
                    'leechers': leechers,
                    'pubdate': pubdate,
                }
                if mode != 'RSS':
                    log.debug('Found result: {0} with {1} seeders and {2} leechers',
                              title, seeders, leechers)

                items.append(item)
            except (AttributeError, TypeError, KeyError, ValueError, IndexError):
                log.exception('Failed parsing provider.')

    return items
def parse(self, data, mode):
    """
    Parse search results for items.

    :param data: The raw response from a search
    :param mode: The current mode used to search, e.g. RSS

    :return: A list of items found
    """
    items = []

    with BS4Parser(data, 'html5lib') as html:
        torrent_table = html.find('table', {'id': 'torrent_table'})

        # Continue only if at least one release is found
        if not torrent_table:
            log.debug('Data returned from provider does not contain any torrents')
            return items

        torrent_rows = torrent_table('tr', {'class': 'torrent'})

        # Continue only if one Release is found
        if not torrent_rows:
            log.debug('Data returned from provider does not contain any torrents')
            return items

        for torrent_row in torrent_rows:
            try:
                is_freeleech = torrent_row.find('img', alt='Freeleech') is not None
                if self.freeleech and not is_freeleech:
                    continue

                # Any of these anchor titles marks a downloadable torrent.
                download_anchor = torrent_row.find('a', {
                    'title': [
                        'Download Torrent',  # Download link
                        'Previously Grabbed Torrent File',  # Already Downloaded
                        'Currently Seeding Torrent',  # Seeding
                        'Currently Leeching Torrent',  # Leeching
                    ]
                })
                if not download_anchor:
                    continue

                download_url = urljoin(self.url, download_anchor['href'])

                title_anchor = torrent_row.find('a', {'data-src': True})
                title = title_anchor['data-src']
                if not all([title, download_url]):
                    continue

                cells = torrent_row('td')
                seeders = try_int(cells[5].text.strip())
                leechers = try_int(cells[6].text.strip())

                # Filter unseeded torrent
                if seeders < self.minseed:
                    if mode != 'RSS':
                        log.debug("Discarding torrent because it doesn't meet the"
                                  ' minimum seeders: {0}. Seeders: {1}',
                                  title, seeders)
                    continue

                size = convert_size(cells[2].find('div').get_text(strip=True)) or -1
                pubdate = self.parse_pubdate(cells[3].find('span')['title'])

                item = {
                    'title': title,
                    'link': download_url,
                    'size': size,
                    'seeders': seeders,
                    'leechers': leechers,
                    'pubdate': pubdate,
                }
                if mode != 'RSS':
                    log.debug('Found result: {0} with {1} seeders and {2} leechers',
                              title, seeders, leechers)

                items.append(item)
            except (AttributeError, TypeError, KeyError, ValueError, IndexError):
                log.exception('Failed parsing provider.')

    return items
def parse(self, data, mode):
    """
    Parse search results for items.

    :param data: The raw response from a search
    :param mode: The current mode used to search, e.g. RSS

    :return: A list of items found
    """
    items = []

    with BS4Parser(data, 'html5lib') as html:
        torrent_rows = html('item')
        for row in torrent_rows:
            try:
                # Only keep video releases.
                if row.category and 'video' not in row.category.get_text(strip=True).lower():
                    continue

                title_raw = row.title.text
                # Add "-" after codec and add missing "."
                title = re.sub(r'([xh][ .]?264|xvid)( )', r'\1-',
                               title_raw).replace(' ', '.') if title_raw else ''

                # Build a magnet link from the info hash at the end of the guid URL.
                info_hash = row.guid.text.rsplit('/', 1)[-1]
                download_url = 'magnet:?xt=urn:btih:' + info_hash + '&dn=' + title + self._custom_trackers
                if not all([title, download_url]):
                    continue

                torrent_size, seeders, leechers = self._split_description(
                    row.find('description').text)
                size = convert_size(torrent_size) or -1

                pubdate_raw = row.pubdate.get_text()
                pubdate = self.parse_pubdate(pubdate_raw)

                # Filter unseeded torrent.
                # Was `min(self.minseed, 1)`, which made the setting ineffective.
                if seeders < self.minseed:
                    if mode != 'RSS':
                        log.debug("Discarding torrent because it doesn't meet the"
                                  ' minimum seeders: {0}. Seeders: {1}',
                                  title, seeders)
                    continue

                item = {
                    'title': title,
                    'link': download_url,
                    'size': size,
                    'seeders': seeders,
                    'leechers': leechers,
                    'pubdate': pubdate,
                }
                if mode != 'RSS':
                    log.debug('Found result: {0} with {1} seeders and {2} leechers',
                              title, seeders, leechers)

                items.append(item)
            except (AttributeError, TypeError, KeyError, ValueError, IndexError):
                log.exception('Failed parsing provider.')

    return items