Code Example #1
def by_feed_enricher(self, content):
    base_url = "https://" + CrawlerConfig('Hostnames',
                                          self.configfile).get('by')
    content = BeautifulSoup(content, 'lxml')
    posts = content.findAll("a",
                            href=re.compile("/category/"),
                            text=re.compile("Download"))
    async_results = []
    for post in posts:
        try:
            async_results.append(base_url + post['href'])
        except:
            pass
    async_results = get_urls_async(async_results, self.configfile, self.dbfile,
                                   self.scraper)
    results = async_results[0]

    entries = []
    if results:
        for result in results:
            try:
                content = []
                details = BeautifulSoup(result, 'lxml').findAll(
                    "td", {
                        "valign": "TOP",
                        "align": "CENTER"
                    })[1]
                title = details.find("small").text
                published = details.find("th", {"align": "RIGHT"}).text
                try:
                    imdb = details.find("a", href=re.compile("imdb.com"))
                    imdb_link = imdb["href"].replace("https://anonym.to/?", "")
                    imdb_score = imdb.text.replace(" ", "").replace("/10", "")
                    if "0.0" in imdb_score:
                        imdb_score = "9.9"
                    content.append('<a href="' + imdb_link + '"' + imdb_score +
                                   '</a>')
                except:
                    pass
                links = details.find_all("iframe")
                for link in links:
                    content.append('href="' + link["src"] + '"')

                content = "".join(content)

                entries.append(
                    FakeFeedParserDict({
                        "title":
                        title,
                        "published":
                        published,
                        "content":
                        [FakeFeedParserDict({"value": content + " mkv"})]
                    }))
            except:
                pass

    feed = {"entries": entries}
    feed = FakeFeedParserDict(feed)
    return feed
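
A note on the return type: the enrichers on this page hand their scraped results back as a FakeFeedParserDict so that the rest of FeedCrawler can treat them like a parsed RSS feed. FakeFeedParserDict is a project-internal helper that is not shown here; for running these snippets in isolation, a minimal stand-in (an assumption about its behavior, modeled on feedparser's FeedParserDict) could look like this:

class FakeFeedParserDict(dict):
    # Minimal stand-in (assumption): a plain dict whose keys are also reachable
    # as attributes, mirroring feedparser's FeedParserDict.
    def __getattr__(self, name):
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)

# Usage: FakeFeedParserDict({"entries": []}).entries evaluates to [].
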
Code Example #2
def nk_feed_enricher(self, content):
    base_url = "https://" + CrawlerConfig('Hostnames',
                                          self.configfile).get('nk')
    content = BeautifulSoup(content, 'lxml')
    posts = content.findAll("a", {"class": "btn"},
                            href=re.compile("/release/"))
    async_results = []
    for post in posts:
        try:
            async_results.append(base_url + post['href'])
        except:
            pass
    async_results = get_urls_async(async_results, self.configfile, self.dbfile,
                                   self.scraper)[0]

    entries = []
    if async_results:
        for result in async_results:
            try:
                content = []
                details = BeautifulSoup(result,
                                        'lxml').find("div",
                                                     {"class": "article"})
                title = details.find("span", {"class": "subtitle"}).text
                published = details.find("p", {"class": "meta"}).text
                content.append("mkv ")
                try:
                    imdb = details.find("a",
                                        href=re.compile("imdb.com"))["href"]
                    content.append('<a href="' + imdb + '" 9,9</a>')
                except:
                    pass
                links = details.find_all("a", href=re.compile("/go/"))
                for link in links:
                    content.append('href="' + base_url + link["href"] + '">' +
                                   link.text + '<')
                content = "".join(content)

                entries.append(
                    FakeFeedParserDict({
                        "title":
                        title,
                        "published":
                        published,
                        "content": [FakeFeedParserDict({"value": content})]
                    }))
            except:
                pass

    feed = {"entries": entries}
    feed = FakeFeedParserDict(feed)
    return feed
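
Both enrichers deliberately emit the hoster links as bare href="..." fragments inside the fake entry content; Code Example #3 below pulls them back out with re.findall(r'href="([^"\'>]*)"', content). A quick illustration against a made-up content string (the domain is a placeholder):

import re

sample = 'mkv href="https://nk.example/go/abc123">SomeHoster<'
print(re.findall(r'href="([^"\'>]*)"', sample))
# ['https://nk.example/go/abc123']
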
Code Example #3
def by_get_download_links(self, content, title):
    async_link_results = re.findall(r'href="([^"\'>]*)"', content)
    async_link_results = get_urls_async(async_link_results, self.configfile,
                                        self.dbfile, self.scraper)

    content = []
    links = async_link_results[0]
    for link in links:
        link = BeautifulSoup(link, 'lxml').find(
            "a", href=re.compile(r"/go\.php\?"))
        try:
            content.append('href="' + link["href"] + '">' +
                           link.text.replace(" ", "") + '<')
        except:
            pass

    content = "".join(content)
    download_links = get_download_links(self, content, title)
    return download_links
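
The per-link parsing step above boils down to locating the redirect anchor whose href runs through /go.php?. A self-contained illustration on a hypothetical page fragment:

import re
from bs4 import BeautifulSoup

html = '<a href="/go.php?id=42">Some Hoster</a>'  # hypothetical fragment
anchor = BeautifulSoup(html, 'lxml').find("a", href=re.compile(r"/go\.php\?"))
print(anchor["href"], anchor.text.replace(" ", ""))
# /go.php?id=42 SomeHoster
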
Code Example #4
def by_page_download_link(self, download_link, key):
    unused_get_feed_parameter(key)
    by = self.hostnames.get('by')
    download_link = get_url(download_link, self.configfile, self.dbfile)
    soup = BeautifulSoup(download_link, 'lxml')
    links = soup.find_all("iframe")
    async_link_results = []
    for link in links:
        link = link["src"]
        if 'https://' + by in link:
            async_link_results.append(link)
    async_link_results = get_urls_async(async_link_results, self.configfile,
                                        self.dbfile)
    links = async_link_results[0]
    url_hosters = []
    for link in links:
        if link:
            link = BeautifulSoup(link, 'lxml').find(
                "a", href=re.compile(r"/go\.php\?"))
            if link:
                url_hosters.append([link["href"], link.text.replace(" ", "")])
    return check_download_links(self, url_hosters)
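
get_urls_async is another FeedCrawler-internal helper. Judging only from how these examples index its return value ([0] for the fetched page bodies, [1] for the scraper, see Code Examples #5 and #7), it appears to return a two-element list. A synchronous stand-in with that shape, useful for poking at the snippets outside the project (an assumption, not the real implementation):

import requests

def get_urls_async(urls, configfile=None, dbfile=None, scraper=None):
    # Assumed return shape: [list of response bodies, scraper object].
    bodies = []
    for url in urls:
        if not url:
            bodies.append("")
            continue
        try:
            bodies.append(requests.get(url, timeout=30).text)
        except requests.RequestException:
            bodies.append("")
    return [bodies, scraper]
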
Code Example #5
File: search.py  Project: Crabtronic/FeedCrawler
def get(title, configfile, dbfile, bl_only=False, sj_only=False):
    hostnames = CrawlerConfig('Hostnames', configfile)
    by = hostnames.get('by')
    dw = hostnames.get('dw')
    fx = hostnames.get('fx')
    nk = hostnames.get('nk')
    sj = hostnames.get('sj')

    specific_season = re.match(r'^(.*),(s\d{1,3})$', title.lower())
    specific_episode = re.match(r'^(.*),(s\d{1,3}e\d{1,3})$', title.lower())
    if specific_season:
        split = title.split(",")
        title = split[0]
        special = split[1].upper()
    elif specific_episode:
        split = title.split(",")
        title = split[0]
        special = split[1].upper()
    else:
        special = None

    bl_final = {}
    sj_final = {}
    scraper = cloudscraper.create_scraper()

    if not sj_only:
        mb_query = sanitize(title).replace(" ", "+")
        if special:
            bl_query = mb_query + "+" + special
        else:
            bl_query = mb_query

        unrated = []

        config = CrawlerConfig('ContentAll', configfile)
        quality = config.get('quality')
        ignore = config.get('ignore')

        if "480p" not in quality:
            search_quality = "+" + quality
        else:
            search_quality = ""

        if by:
            by_search = 'https://' + by + '/?q=' + bl_query + search_quality
        else:
            by_search = None
        if dw:
            dw_search = 'https://' + dw + '/?kategorie=Movies&search=' + bl_query + search_quality
        else:
            dw_search = None
        if fx:
            fx_search = 'https://' + fx + '/?s=' + bl_query
        else:
            fx_search = None

        async_results = get_urls_async([by_search, dw_search, fx_search],
                                       configfile, dbfile, scraper)
        scraper = async_results[1]
        async_results = async_results[0]

        by_results = []
        dw_results = []
        fx_results = []

        for res in async_results:
            if check_is_site(res, configfile) == 'BY':
                by_results = by_search_results(res, by)
            elif check_is_site(res, configfile) == 'DW':
                dw_results = dw_search_results(res, dw)
            elif check_is_site(res, configfile) == 'FX':
                fx_results = fx_search_results(fx_content_to_soup(res),
                                               configfile, dbfile, scraper)

        if nk:
            nk_search = post_url(
                'https://' + nk + "/search",
                configfile,
                dbfile,
                data={'search': bl_query.replace("+", " ") + " " + quality})
            nk_results = nk_search_results(nk_search, 'https://' + nk + '/')
        else:
            nk_results = []

        password = by
        for result in by_results:
            if "480p" in quality:
                if "720p" in result[0].lower() or "1080p" in result[0].lower() or "1080i" in result[
                    0].lower() or "2160p" in \
                        result[0].lower() or "complete.bluray" in result[0].lower() or "complete.mbluray" in result[
                    0].lower() or "complete.uhd.bluray" in result[0].lower():
                    continue
            if "xxx" not in result[0].lower():
                unrated.append([
                    rate(result[0], ignore),
                    encode_base64(result[1] + "|" + password),
                    result[0] + " (BY)"
                ])

        password = dw
        for result in dw_results:
            if "480p" in quality:
                if "720p" in result[0].lower() or "1080p" in result[0].lower() or "1080i" in result[
                    0].lower() or "2160p" in \
                        result[0].lower() or "complete.bluray" in result[0].lower() or "complete.mbluray" in result[
                    0].lower() or "complete.uhd.bluray" in result[0].lower():
                    continue
            unrated.append([
                rate(result[0], ignore),
                encode_base64(result[1] + "|" + password), result[0] + " (DW)"
            ])

        password = fx.split('.')[0]
        for result in fx_results:
            if "480p" in quality:
                if "720p" in result[0].lower() or "1080p" in result[0].lower() or "1080i" in result[
                    0].lower() or "2160p" in \
                        result[0].lower() or "complete.bluray" in result[0].lower() or "complete.mbluray" in result[
                    0].lower() or "complete.uhd.bluray" in result[0].lower():
                    continue
            if "-low" not in result[0].lower():
                unrated.append([
                    rate(result[0], ignore),
                    encode_base64(result[1] + "|" + password),
                    result[0] + " (FX)"
                ])

        password = nk.split('.')[0].capitalize()
        for result in nk_results:
            if "480p" in quality:
                if "720p" in result[0].lower() or "1080p" in result[0].lower() or "1080i" in result[
                    0].lower() or "2160p" in \
                        result[0].lower() or "complete.bluray" in result[0].lower() or "complete.mbluray" in result[
                    0].lower() or "complete.uhd.bluray" in result[0].lower():
                    continue
            unrated.append([
                rate(result[0], ignore),
                encode_base64(result[1] + "|" + password), result[0] + " (NK)"
            ])

        rated = sorted(unrated, reverse=True)

        results = {}
        i = 0

        for result in rated:
            res = {"payload": result[1], "title": result[2]}
            results["result" + str(i + 1000)] = res
            i += 1
        bl_final = results

    if not bl_only:
        if sj:
            sj_query = sanitize(title).replace(" ", "+")
            sj_search = get_url(
                'https://' + sj + '/serie/search?q=' + sj_query, configfile,
                dbfile, scraper)
            try:
                sj_results = BeautifulSoup(sj_search, 'lxml').findAll(
                    "a", href=re.compile("/serie"))
            except:
                sj_results = []
        else:
            sj_results = []

        if special:
            append = " (" + special + ")"
        else:
            append = ""
        i = 0
        results = {}
        for result in sj_results:
            r_title = result.text
            r_rating = fuzz.ratio(title.lower(), r_title)
            if r_rating > 40:
                res = {
                    "payload":
                    encode_base64(result['href'] + "|" + r_title + "|" +
                                  str(special)),
                    "title":
                    r_title + append
                }
                results["result" + str(i + 1000)] = res
                i += 1
        sj_final = results

    return bl_final, sj_final
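
The payload handed to the download step in Code Example #6 is simply the base64 of "link|password" (or "link|key|password" for the DW and FX branches). encode_base64 and decode_base64 are project helpers not shown on this page; a plain standard-library round trip illustrates the shape (the exact encoding the helpers use is an assumption):

import base64

payload = base64.b64encode("https://by.example/release/123|by.example".encode("utf-8")).decode("utf-8")
link, password = base64.b64decode(payload).decode("utf-8").split("|")
print(link, password)
# https://by.example/release/123 by.example
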
Code Example #6
File: content_all.py  Project: Crabtronic/FeedCrawler
def download(payload, device, configfile, dbfile):
    config = CrawlerConfig('ContentAll', configfile)
    db = FeedDb(dbfile, 'FeedCrawler')
    hostnames = CrawlerConfig('Hostnames', configfile)
    by = hostnames.get('by')
    nk = hostnames.get('nk')

    payload = decode_base64(payload).split("|")
    link = payload[0]
    password = payload[1]

    site = check_is_site(link, configfile)
    if not site:
        return False
    elif "DW" in site:
        download_method = add_decrypt_instead_of_download
        download_links = [link]
        key = payload[1]
        password = payload[2]
    else:
        url = get_url(link, configfile, dbfile)
        if not url or "NinjaFirewall 429" in url:
            return False
        download_method = myjd_download
        soup = BeautifulSoup(url, 'lxml')

        if "BY" in site:
            key = soup.find("small").text
            links = soup.find_all("iframe")
            async_link_results = []
            for link in links:
                link = link["src"]
                if 'https://' + by in link:
                    async_link_results.append(link)
            async_link_results = get_urls_async(async_link_results, configfile,
                                                dbfile)
            links = async_link_results[0]
            url_hosters = []
            for link in links:
                if link:
                    link = BeautifulSoup(link, 'lxml').find(
                        "a", href=re.compile("/go\.php\?"))
                    if link:
                        url_hosters.append(
                            [link["href"],
                             link.text.replace(" ", "")])
        elif "NK" in site:
            key = soup.find("span", {"class": "subtitle"}).text
            url_hosters = []
            hosters = soup.find_all("a", href=re.compile("/go/"))
            for hoster in hosters:
                url_hosters.append(
                    ['https://' + nk + hoster["href"], hoster.text])
        elif "FX" in site:
            key = payload[1]
            password = payload[2]
        else:
            return False

        links = {}
        if "FX" in site:

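            # Throwaway stand-in passed to fx_get_download_links in place of the
            # usual crawler instance; only a configfile attribute is attached.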
            class FX:
                configfile = ""

            FX.configfile = configfile
            download_links = fx_get_download_links(FX, url, key)
        else:
            for url_hoster in reversed(url_hosters):
                try:
                    link_hoster = url_hoster[1].lower().replace(
                        'target="_blank">',
                        '').replace(" ", "-").replace("ddownload", "ddl")
                    if check_hoster(link_hoster, configfile):
                        link = url_hoster[0]
                        if by in link:
                            demasked_link = get_redirected_url(
                                link, configfile, dbfile, False)
                            if demasked_link:
                                link = demasked_link
                        links[link_hoster] = link
                except:
                    pass
            if config.get("hoster_fallback") and not links:
                for url_hoster in reversed(url_hosters):
                    link_hoster = url_hoster[1].lower().replace(
                        'target="_blank">',
                        '').replace(" ", "-").replace("ddownload", "ddl")
                    link = url_hoster[0]
                    if by in link:
                        demasked_link = get_redirected_url(
                            link, configfile, dbfile, False)
                        if demasked_link:
                            link = demasked_link
                    links[link_hoster] = link
            download_links = list(links.values())

    englisch = False
    if "*englisch" in key.lower() or "*english" in key.lower():
        key = key.replace('*ENGLISCH', '').replace("*Englisch", "").replace(
            "*ENGLISH", "").replace("*English", "").replace("*", "")
        englisch = True

    staffel = re.search(r"s\d{1,2}(-s\d{1,2}|-\d{1,2}|\.)", key.lower())

    if download_links:
        if staffel:
            if download_method(configfile, dbfile, device, key, "FeedCrawler",
                               download_links, password):
                db.store(
                    key.replace(".COMPLETE", "").replace(".Complete", ""),
                    'notdl' if config.get('enforcedl')
                    and '.dl.' not in key.lower() else 'added')
                log_entry = '[Suche/Staffel] - ' + key.replace(
                    ".COMPLETE", "").replace(".Complete",
                                             "") + ' - [' + site + ']'
                logger.info(log_entry)
                notify([log_entry], configfile)
                return True
        else:
            retail = False
            if config.get('cutoff') and '.complete.' not in key.lower():
                if is_retail(key, dbfile):
                    retail = True
            if download_method(configfile, dbfile, device, key, "FeedCrawler",
                               download_links, password):
                db.store(
                    key, 'notdl' if config.get('enforcedl')
                    and '.dl.' not in key.lower() else 'added')
                log_entry = '[Suche/Film' + (
                    '/Englisch' if englisch and not retail else
                    '') + ('/Englisch/Retail' if englisch and retail else
                           '') + ('/Retail' if not englisch and retail else
                                  '') + '] - ' + key + ' - [' + site + ']'
                logger.info(log_entry)
                notify([log_entry], configfile)
                return [key]
    else:
        return False
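
The staffel check above flags a release as a season pack when its name contains sNN followed by a season range, an episode-less range, or a dot. Two quick checks against the regex used in the code (the release names are made up):

import re

staffel_re = re.compile(r"s\d{1,2}(-s\d{1,2}|-\d{1,2}|\.)")
print(bool(staffel_re.search("some.show.s01.german.1080p.web.h264-grp")))     # True (season pack)
print(bool(staffel_re.search("some.show.s01e04.german.1080p.web.h264-grp")))  # False (single episode)
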
Code Example #7
def get_search_results(self, bl_query):
    hostnames = CrawlerConfig('Hostnames', self.configfile)
    by = hostnames.get('by')
    dw = hostnames.get('dw')
    fx = hostnames.get('fx')
    nk = hostnames.get('nk')

    search_results = []

    config = CrawlerConfig('ContentAll', self.configfile)
    quality = config.get('quality')

    if "480p" not in quality:
        search_quality = "+" + quality
    else:
        search_quality = ""

    if by:
        by_search = 'https://' + by + '/?q=' + bl_query + search_quality
    else:
        by_search = None
    if dw:
        dw_search = 'https://' + dw + '/?kategorie=Movies&search=' + bl_query + search_quality
    else:
        dw_search = None
    if fx:
        fx_search = 'https://' + fx + '/?s=' + bl_query
    else:
        fx_search = None

    async_results = get_urls_async([by_search, dw_search, fx_search],
                                   self.configfile, self.dbfile, self.scraper)
    scraper = async_results[1]
    async_results = async_results[0]

    by_results = []
    dw_results = []
    fx_results = []

    for res in async_results:
        if check_is_site(res, self.configfile) == 'BY':
            by_results = by_search_results(res, by)
        elif check_is_site(res, self.configfile) == 'DW':
            dw_results = dw_search_results(res, dw)
        elif check_is_site(res, self.configfile) == 'FX':
            fx_results = fx_search_results(fx_content_to_soup(res),
                                           self.configfile, self.dbfile,
                                           scraper)

    if nk:
        nk_search = post_url(
            'https://' + nk + "/search",
            self.configfile,
            self.dbfile,
            data={'search': bl_query.replace("+", " ") + " " + quality})
        nk_results = nk_search_results(nk_search, 'https://' + nk + '/')
    else:
        nk_results = []

    password = by
    for result in by_results:
        if "480p" in quality:
            if "720p" in result[0].lower() or "1080p" in result[0].lower() or "1080i" in result[
                0].lower() or "2160p" in \
                    result[0].lower() or "complete.bluray" in result[0].lower() or "complete.mbluray" in result[
                0].lower() or "complete.uhd.bluray" in result[0].lower():
                continue
        if "xxx" not in result[0].lower():
            search_results.append([result[0], result[1] + "|" + password])

    password = dw
    for result in dw_results:
        if "480p" in quality:
            if "720p" in result[0].lower() or "1080p" in result[0].lower() or "1080i" in result[
                0].lower() or "2160p" in \
                    result[0].lower() or "complete.bluray" in result[0].lower() or "complete.mbluray" in result[
                0].lower() or "complete.uhd.bluray" in result[0].lower():
                continue
        search_results.append([result[0], result[1] + "|" + password])

    for result in fx_results:
        if "480p" in quality:
            if "720p" in result[0].lower() or "1080p" in result[0].lower() or "1080i" in result[
                0].lower() or "2160p" in \
                    result[0].lower() or "complete.bluray" in result[0].lower() or "complete.mbluray" in result[
                0].lower() or "complete.uhd.bluray" in result[0].lower():
                continue
        if "-low" not in result[0].lower():
            search_results.append([result[0], result[1]])

    password = nk.split('.')[0].capitalize()
    for result in nk_results:
        if "480p" in quality:
            if "720p" in result[0].lower() or "1080p" in result[0].lower() or "1080i" in result[
                0].lower() or "2160p" in \
                    result[0].lower() or "complete.bluray" in result[0].lower() or "complete.mbluray" in result[
                0].lower() or "complete.uhd.bluray" in result[0].lower():
                continue
        search_results.append([result[0], result[1] + "|" + password])

    return search_results
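
The same 480p exclusion list is repeated for every site in both search paths (Code Examples #5 and #7). If the list ever needs to change, a small shared helper would keep it in one place; a sketch, not part of the project:

HIGH_RES_TAGS = ("720p", "1080p", "1080i", "2160p",
                 "complete.bluray", "complete.mbluray", "complete.uhd.bluray")

def is_high_res(release_name):
    # True if the release name carries any tag a 480p-only search should skip.
    lowered = release_name.lower()
    return any(tag in lowered for tag in HIGH_RES_TAGS)

Each loop body above would then reduce to: if "480p" in quality and is_high_res(result[0]): continue.
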
Code Example #8
def dw_feed_enricher(self, content):
    base_url = "https://" + CrawlerConfig('Hostnames',
                                          self.configfile).get('dw')
    content = BeautifulSoup(content, 'lxml')
    posts = content.findAll("a", href=re.compile("download/"))
    href_by_id = {}
    async_results = []
    for post in posts:
        try:
            post_id = post['href'].replace("download/", "").split("/")[0]
            post_link = base_url + "/" + post['href']
            post_hosters = post.parent.findAll(
                "img", src=re.compile(r"images/icon_hoster"))
            hosters = []
            for hoster in post_hosters:
                hosters.append(hoster["title"].replace("Download bei ", ""))
            hosters = "|".join(hosters)
            href_by_id[post_id] = {"hosters": hosters, "link": post_link}
            async_results.append(post_link)
        except:
            pass
    async_results = get_urls_async(async_results, self.configfile, self.dbfile,
                                   self.scraper)
    results = async_results[0]

    entries = []
    if results:
        for result in results:
            try:
                content = []
                details = BeautifulSoup(result, 'lxml')
                title = details.title.text.split(' //')[0].replace(
                    "*mirror*", "").strip()
                post_id = details.find(
                    "a",
                    {"data-warezkorb": re.compile(r"\d*")})["data-warezkorb"]
                details = details.findAll("div", {"class": "row"})[3]
                published = details.findAll("td")[1].text.replace("Datum", "")
                try:
                    imdb = details.findAll("td")[6].find("a")
                    imdb_link = imdb["href"]
                    imdb_score = imdb.find("b").text.replace(" ", "").replace(
                        "/10", "")
                    if "0.0" in imdb_score:
                        imdb_score = "9.9"
                    content.append('<a href="' + imdb_link + '"' + imdb_score +
                                   '</a>')
                except:
                    pass

                content.append('DOWNLOADLINK="' + href_by_id[post_id]["link"] +
                               '"')
                content.append('HOSTERS="' + href_by_id[post_id]["hosters"] +
                               '"')

                content = "".join(content)

                entries.append(
                    FakeFeedParserDict({
                        "title":
                        title,
                        "published":
                        published,
                        "content":
                        [FakeFeedParserDict({"value": content + " mkv"})]
                    }))
            except:
                pass

    feed = {"entries": entries}
    feed = FakeFeedParserDict(feed)
    return feed
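
dw_feed_enricher embeds the detail link and the pipe-separated hoster list as quoted key/value fragments inside the fake entry content. The downstream parser is not shown on this page, but recovering both values takes nothing more than two regexes (illustrative, made-up content string):

import re

content = 'DOWNLOADLINK="https://dw.example/download/123/release"HOSTERS="rapidgator|ddownload" mkv'
download_link = re.search(r'DOWNLOADLINK="([^"]+)"', content).group(1)
hosters = re.search(r'HOSTERS="([^"]*)"', content).group(1).split("|")
print(download_link, hosters)
# https://dw.example/download/123/release ['rapidgator', 'ddownload']
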