def notify(added_items):
    notifications = RssConfig('Notifications')
    homeassistant_settings = notifications.get("homeassistant").split(',')
    pushbullet_token = notifications.get("pushbullet")
    pushover_settings = notifications.get("pushover").split(',')
    items = []
    for item in added_items:
        item = item.replace('[<a href="', '').replace('" target="_blank">Link</a>]', '')
        items.append(item)
    if len(items) > 0:
        cut_items = list(api_request_cutter(items, 5))
        if len(notifications.get("homeassistant")) > 0:
            for cut_item in cut_items:
                homeassistant_url = homeassistant_settings[0]
                homeassistant_password = homeassistant_settings[1]
                Homeassistant(cut_item, homeassistant_url, homeassistant_password)
        if len(notifications.get("pushbullet")) > 0:
            Pushbullet(items, pushbullet_token)
        if len(notifications.get('pushover')) > 0:
            for cut_item in cut_items:
                pushover_user = pushover_settings[0]
                pushover_token = pushover_settings[1]
                Pushover(cut_item, pushover_user, pushover_token)
def __init__(self):
    self.config = RssConfig(self._INTERNAL_NAME)
    self.log_info = logging.info
    self.log_error = logging.error
    self.log_debug = logging.debug
    list([_mkdir_p(os.path.dirname(self.config.get(f))) for f in ['db_file', 'file']])
    _mkdir_p(self.config.get('crawljob_directory'))
    self.db = RssDb(self.config.get('db_file'))
    self._periodical_active = False
    self.periodical = RepeatableTimer(
        int(self.config.get('interval')) * 60, self.periodical_task)
def getURL(url):
    proxy = RssConfig('RSScrawler').get('proxy')
    if proxy:
        # Map the configured proxy URL onto the scheme key the scraper expects.
        proxies = {}
        if proxy.startswith('http://'):
            proxies['http'] = proxy
        elif proxy.startswith('https://'):
            proxies['https'] = proxy
        elif proxy.startswith('socks5://'):
            proxies['http'] = proxy
        scraper = cfscrape.create_scraper(delay=10, proxies=proxies)
    else:
        scraper = cfscrape.create_scraper(delay=10)
    return scraper.get(url).content
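A brief usage sketch for getURL as defined above; the feed URL is the one MovieblogFeed uses further down in this section, everything else is purely illustrative.

# Hedged usage sketch: fetch a page through the Cloudflare-aware scraper;
# the proxy, if configured under [RSScrawler], is applied inside getURL().
page_content = getURL('http://www.movie-blog.org/feed/')
print(len(page_content))  # raw response body as returned by the scraper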
def write_crawljob_file(package_name, folder_name, link_text, crawljob_dir, subdir):
    crawljob_file = crawljob_dir + '/%s.crawljob' % unicode(
        re.sub('[^\w\s\.-]', '', package_name.replace(' ', '')).strip().lower())
    crawljobs = RssConfig('Crawljobs')
    autostart = crawljobs.get("autostart")
    usesubdir = crawljobs.get("subdir")
    if not usesubdir:
        subdir = ""
    if autostart:
        autostart = "TRUE"
    else:
        autostart = "FALSE"
    try:
        file = open(crawljob_file, 'w')
        file.write('enabled=TRUE\n')
        file.write('autoStart=' + autostart + '\n')
        file.write('extractPasswords=["' + "bW92aWUtYmxvZy5vcmc=".decode('base64') + '","' +
                   "c2VyaWVuanVua2llcy5vcmc=".decode('base64') + '","' +
                   "aGQtYXJlYS5vcmc=".decode('base64') + '","' +
                   "aGQtd29ybGQub3Jn".decode('base64') + '","' +
                   "d2FyZXotd29ybGQub3Jn".decode('base64') + '"]\n')
        file.write('downloadPassword='******'base64') + '\n')
        file.write('extractAfterDownload=TRUE\n')
        file.write('forcedStart=' + autostart + '\n')
        file.write('autoConfirm=' + autostart + '\n')
        if not subdir == "":
            file.write('downloadFolder=' + subdir + "/" + '%s\n' % folder_name)
            if subdir == "RSScrawler/Remux":
                file.write('priority=Lower\n')
        else:
            file.write('downloadFolder=' + '%s\n' % folder_name)
        file.write('packageName=%s\n' % package_name.replace(' ', ''))
        file.write('text=%s\n' % link_text)
        file.close()
        return True
    except UnicodeEncodeError as e:
        file.close()
        log_error("Beim Schreibversuch des Crawljobs: %s FEHLER: %s" % (crawljob_file, e.message))
        if os.path.isfile(crawljob_file):
            log_info("Entferne defekten Crawljob: %s" % crawljob_file)
            os.remove(crawljob_file)
        return False
def notify(added_items):
    notifications = RssConfig('Notifications')
    homeassistant_settings = notifications.get("homeassistant").split(',')
    pushbullet_token = notifications.get("pushbullet")
    pushover_settings = notifications.get("pushover").split(',')
    items = []
    for item in added_items:
        item = re.sub(r' - <a href.*<\/a>', '', item)
        items.append(item)
    if len(items) > 0:
        cut_items = list(api_request_cutter(items, 5))
        if len(notifications.get("homeassistant")) > 0:
            for cut_item in cut_items:
                homeassistant_url = homeassistant_settings[0]
                homeassistant_password = homeassistant_settings[1]
                Homeassistant(cut_item, homeassistant_url, homeassistant_password)
        if len(notifications.get("pushbullet")) > 0:
            Pushbullet(items, pushbullet_token)
        if len(notifications.get('pushover')) > 0:
            for cut_item in cut_items:
                pushover_user = pushover_settings[0]
                pushover_token = pushover_settings[1]
                Pushover(cut_item, pushover_user, pushover_token)
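Both notify() variants rely on api_request_cutter, which is not part of this section; a minimal sketch under the assumption that it only slices the item list into batches of at most n entries (name and behaviour inferred, not confirmed by this code).

def api_request_cutter(items, n):
    # Assumed behaviour: yield successive slices of at most n items,
    # so each notification API call stays small.
    for i in range(0, len(items), n):
        yield items[i:i + n]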
def get_all():
    if request.method == 'GET':
        general = RssConfig('RSScrawler')
        alerts = RssConfig('Notifications')
        crawljobs = RssConfig('Crawljobs')
        mb = RssConfig('MB')
        sj = RssConfig('SJ')
        yt = RssConfig('YT')
        ver = version.getVersion()
        if version.updateCheck()[0]:
            updateready = True
            updateversion = version.updateCheck()[1]
            print('Update steht bereit (' + updateversion +
                  ')! Weitere Informationen unter https://github.com/rix1337/RSScrawler/releases/latest')
        else:
            updateready = False
        log = ''
        logfile = os.path.join(os.path.dirname(sys.argv[0]), 'RSScrawler.log')
        if os.path.isfile(logfile):
            logfile = open(os.path.join(logfile))
            output = StringIO.StringIO()
            for line in reversed(logfile.readlines()):
                output.write("<p>" + line.replace("\n", "</p>"))
            log = output.getvalue()
        return jsonify({
            "version": {
                "ver": ver,
                "update_ready": updateready,
                "docker": docker,
            },
            "log": log,
            "lists": {
                "mb": {
                    "filme": getListe('MB_Filme'),
                    "filme3d": getListe('MB_3D'),
                    "regex": getListe('MB_Regex'),
                },
                "sj": {
                    "serien": getListe('SJ_Serien'),
                    "regex": getListe('SJ_Serien_Regex'),
                    "staffeln_regex": getListe('SJ_Staffeln_Regex'),
                },
                "mbsj": {
                    "staffeln": getListe('MB_Staffeln'),
                },
                "yt": {
                    "kanaele_playlisten": getListe('YT_Channels'),
                },
            },
            "settings": {
                "general": {
                    "pfad": general.get("jdownloader"),
                    "port": to_int(general.get("port")),
                    "prefix": general.get("prefix"),
                    "interval": to_int(general.get("interval")),
                    "english": bool(general.get("english")),
                    "hoster": general.get("hoster"),
                },
                "alerts": {
                    "homeassistant": alerts.get("homeassistant"),
                    "pushbullet": alerts.get("pushbullet"),
                    "pushover": alerts.get("pushover"),
                },
                "crawljobs": {
                    "autostart": bool(crawljobs.get("autostart")),
                    "subdir": bool(crawljobs.get("subdir")),
                },
                "mb": {
                    "quality": mb.get("quality"),
                    "ignore": mb.get("ignore"),
                    "regex": bool(mb.get("regex")),
                    "imdb_score": to_float(mb.get("imdb")),
                    "imdb_year": to_int(mb.get("imdbyear")),
                    "historical": bool(mb.get("historical")),
                    "force_dl": bool(mb.get("enforcedl")),
                    "cutoff": bool(mb.get("cutoff")),
                    "crawl_3d": bool(mb.get("crawl3d")),
                },
                "sj": {
                    "quality": sj.get("quality"),
                    "ignore": sj.get("rejectlist"),
                    "regex": bool(sj.get("regex")),
                },
                "mbsj": {
                    "enabled": bool(mb.get("crawlseasons")),
                    "quality": mb.get("seasonsquality"),
                    "packs": bool(mb.get("seasonpacks")),
                    "source": mb.get("seasonssource"),
                },
                "yt": {
                    "enabled": bool(yt.get("youtube")),
                    "max": to_int(yt.get("maxvideos")),
                    "ignore": yt.get("ignore"),
                }
            }
        })
    else:
        return "Failed", 405
def get_post_settings():
    if request.method == 'GET':
        general = RssConfig('RSScrawler')
        alerts = RssConfig('Notifications')
        crawljobs = RssConfig('Crawljobs')
        mb = RssConfig('MB')
        sj = RssConfig('SJ')
        yt = RssConfig('YT')
        return jsonify({
            "settings": {
                "general": {
                    "pfad": general.get("jdownloader"),
                    "port": to_int(general.get("port")),
                    "prefix": general.get("prefix"),
                    "interval": to_int(general.get("interval")),
                    "english": bool(general.get("english")),
                    "hoster": general.get("hoster"),
                },
                "alerts": {
                    "homeassistant": alerts.get("homeassistant"),
                    "pushbullet": alerts.get("pushbullet"),
                    "pushover": alerts.get("pushover"),
                },
                "crawljobs": {
                    "autostart": bool(crawljobs.get("autostart")),
                    "subdir": bool(crawljobs.get("subdir")),
                },
                "mb": {
                    "quality": mb.get("quality"),
                    "ignore": mb.get("ignore"),
                    "regex": bool(mb.get("regex")),
                    "imdb_score": to_float(mb.get("imdb")),
                    "imdb_year": to_int(mb.get("imdbyear")),
                    "historical": bool(mb.get("historical")),
                    "force_dl": bool(mb.get("enforcedl")),
                    "cutoff": bool(mb.get("cutoff")),
                    "crawl_3d": bool(mb.get("crawl3d")),
                },
                "sj": {
                    "quality": sj.get("quality"),
                    "ignore": sj.get("rejectlist"),
                    "regex": bool(sj.get("regex")),
                },
                "mbsj": {
                    "enabled": bool(mb.get("crawlseasons")),
                    "quality": mb.get("seasonsquality"),
                    "packs": bool(mb.get("seasonpacks")),
                    "source": mb.get("seasonssource"),
                },
                "yt": {
                    "enabled": bool(yt.get("youtube")),
                    "max": to_int(yt.get("maxvideos")),
                    "ignore": yt.get("ignore"),
                }
            }
        })
    if request.method == 'POST':
        data = request.json
        with open(os.path.join(os.path.dirname(sys.argv[0]), 'Einstellungen/RSScrawler.ini'), 'wb') as f:
            f.write('# RSScrawler.ini (Stand: RSScrawler ' + version.getVersion() + ')\n')
            f.write("\n[RSScrawler]\n")
            f.write("jdownloader = " + to_str(data['general']['pfad']).encode('utf-8') + "\n")
            f.write("port = " + to_str(data['general']['port']).encode('utf-8') + "\n")
            f.write("prefix = " + to_str(data['general']['prefix']).encode('utf-8').lower() + "\n")
            interval = to_str(data['general']['interval']).encode('utf-8')
            if to_int(interval) < 3:
                interval = '3'
            f.write("interval = " + interval + "\n")
            f.write("english = " + to_str(data['general']['english']).encode('utf-8') + "\n")
            f.write("hoster = " + to_str(data['general']['hoster']).encode('utf-8') + "\n")
            f.write("\n[MB]\n")
            f.write("quality = " + to_str(data['mb']['quality']).encode('utf-8') + "\n")
            f.write("ignore = " + to_str(data['mb']['ignore']).encode('utf-8').lower() + "\n")
            f.write("historical = " + to_str(data['mb']['historical']).encode('utf-8') + "\n")
            f.write("regex = " + to_str(data['mb']['regex']).encode('utf-8') + "\n")
            f.write("cutoff = " + to_str(data['mb']['cutoff']).encode('utf-8') + "\n")
            f.write("crawl3d = " + to_str(data['mb']['crawl_3d']).encode('utf-8') + "\n")
            f.write("enforcedl = " + to_str(data['mb']['force_dl']).encode('utf-8') + "\n")
            f.write("crawlseasons = " + to_str(data['mbsj']['enabled']).encode('utf-8') + "\n")
            f.write("seasonsquality = " + to_str(data['mbsj']['quality']).encode('utf-8') + "\n")
            f.write("seasonpacks = " + to_str(data['mbsj']['packs']).encode('utf-8') + "\n")
            f.write("seasonssource = " + to_str(data['mbsj']['source']).encode('utf-8').lower() + "\n")
            f.write("imdbyear = " + to_str(data['mb']['imdb_year']).encode('utf-8') + "\n")
            imdb = to_str(data['mb']['imdb_score']).encode('utf-8')
            if re.match('[^0-9]', imdb):
                imdb = 0.0
            elif imdb == '':
                imdb = 0.0
            else:
                imdb = round(float(to_str(data['mb']['imdb_score']).encode('utf-8').replace(",", ".")), 1)
            if imdb > 10:
                imdb = 10.0
            f.write("imdb = " + to_str(imdb) + "\n")
            f.write("\n[SJ]\n")
            f.write("quality = " + to_str(data['sj']['quality']).encode('utf-8') + "\n")
            f.write("rejectlist = " + to_str(data['sj']['ignore']).encode('utf-8').lower() + "\n")
            f.write("regex = " + to_str(data['sj']['regex']).encode('utf-8') + "\n")
            f.write("\n[YT]\n")
            f.write("youtube = " + to_str(data['yt']['enabled']).encode('utf-8') + "\n")
            maxvideos = to_str(data['yt']['max']).encode('utf-8')
            if maxvideos == "":
                maxvideos = "10"
            if to_int(maxvideos) < 1:
                f.write("maxvideos = 1\n")
            elif to_int(maxvideos) > 50:
                f.write("maxvideos = 50\n")
            else:
                f.write("maxvideos = " + to_str(maxvideos) + "\n")
            f.write("ignore = " + to_str(data['yt']['ignore']).encode('utf-8') + "\n")
            f.write("\n[Notifications]\n")
            f.write("homeassistant = " + to_str(data['alerts']['homeassistant']).encode('utf-8') + "\n")
            f.write("pushbullet = " + to_str(data['alerts']['pushbullet']).encode('utf-8') + "\n")
            f.write("pushover = " + to_str(data['alerts']['pushover']).encode('utf-8') + "\n")
            f.write("\n[Crawljobs]\n")
            f.write("autostart = " + to_str(data['crawljobs']['autostart']).encode('utf-8') + "\n")
            f.write("subdir = " + to_str(data['crawljobs']['subdir']).encode('utf-8') + "\n")
        files.check()
        return "Success", 201
    else:
        return "Failed", 405
import version

import StringIO
import os
import re
import sys
import logging

# Flask objects used by the app and the route handlers in this module.
from flask import Flask, request, jsonify

app = Flask(__name__, static_url_path='/web', template_folder='web')

if not os.path.exists(os.path.join(os.path.dirname(sys.argv[0]), 'Einstellungen')):
    prefix = ""
else:
    general = RssConfig('RSScrawler')
    if general.get("prefix"):
        prefix = '/' + general.get("prefix")
    else:
        prefix = ""


def to_int(i):
    i = i.strip().replace("None", "")
    return int(i) if i else ""


def to_float(i):
    i = i.strip().replace("None", "")
    return float(i) if i else ""
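to_int and to_float deliberately return an empty string instead of raising when the stored value is empty or the literal "None"; a short illustration with hypothetical values:

# Hypothetical values, shown only to illustrate the fallback behaviour.
print(to_int("9090"))    # 9090
print(to_int(" None "))  # '' (empty string, not 0)
print(to_float("7.5"))   # 7.5
print(to_float(""))      # ''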
class SJ():
    MIN_CHECK_INTERVAL = 2 * 60  # 2 minutes
    _INTERNAL_NAME = 'SJ'

    def __init__(self):
        self.config = RssConfig(self._INTERNAL_NAME)
        self.log_info = logging.info
        self.log_error = logging.error
        self.log_debug = logging.debug
        list([_mkdir_p(os.path.dirname(self.config.get(f))) for f in ['db_file', 'file']])
        _mkdir_p(self.config.get('crawljob_directory'))
        self.db = RssDb(self.config.get('db_file'))
        self._periodical_active = False
        self.periodical = RepeatableTimer(
            int(self.config.get('interval')) * 60, self.periodical_task)

    def activate(self):
        self._periodical_active = True
        self.periodical.start()

    @_restart_timer
    def periodical_task(self):
        feed = feedparser.parse('http://serienjunkies.org/xml/feeds/episoden.xml')
        self.pattern = "|".join(getSeriesList(self.config.get("file"))).lower()
        reject = self.config.get("rejectlist").replace(";", "|").lower() if len(
            self.config.get("rejectlist")) > 0 else "^unmatchable$"
        self.quality = self.config.get("quality")
        self.hoster = self.config.get("hoster")
        if self.hoster == "alle":
            self.hoster = "."
        self.added_items = []
        for post in feed.entries:
            link = post.link
            title = post.title
            if self.config.get("regex"):
                m = re.search(self.pattern, title.lower())
                if not m and not "720p" in title and not "1080p" in title:
                    m = re.search(self.pattern.replace("480p", "."), title.lower())
                    self.quality = "480p"
                if m:
                    if "720p" in title.lower():
                        self.quality = "720p"
                    if "1080p" in title.lower():
                        self.quality = "1080p"
                    m = re.search(reject, title.lower())
                    if m:
                        self.log_debug("Rejected: " + title)
                        continue
                    title = re.sub('\[.*\] ', '', post.title)
                    self.range_checkr(link, title)
            else:
                if self.config.get("quality") != '480p':
                    m = re.search(self.pattern, title.lower())
                    if m:
                        if self.config.get("language") in title:
                            mm = re.search(self.quality, title.lower())
                            if mm:
                                mmm = re.search(reject, title.lower())
                                if mmm:
                                    self.log_debug("Rejected: " + title)
                                    continue
                                title = re.sub('\[.*\] ', '', post.title)
                                self.range_checkr(link, title)
                else:
                    m = re.search(self.pattern, title.lower())
                    if m:
                        if self.config.get("language") in title:
                            if "720p" in title.lower() or "1080p" in title.lower():
                                continue
                            mm = re.search(reject, title.lower())
                            if mm:
                                self.log_debug("Rejected: " + title)
                                continue
                            title = re.sub('\[.*\] ', '', post.title)
                            self.range_checkr(link, title)
        if len(self.config.get('pushbulletapi')) > 2:
            notifyPushbulletSJ(self.config.get("pushbulletapi"), self.added_items) if len(self.added_items) > 0 else True

    def range_checkr(self, link, title):
        pattern = re.match(".*S\d{2}E\d{2}-\w?\d{2}.*", title)
        if pattern is not None:
            range0 = re.sub(r".*S\d{2}E(\d{2}-\w?\d{2}).*", r"\1", title).replace("E", "")
            number1 = re.sub(r"(\d{2})-\d{2}", r"\1", range0)
            number2 = re.sub(r"\d{2}-(\d{2})", r"\1", range0)
            title_cut = re.findall(r"(.*S\d{2}E)(\d{2}-\w?\d{2})(.*)", title)
            try:
                for count in range(int(number1), (int(number2) + 1)):
                    # Check whether the episode number already has two digits.
                    NR = re.match(r"\d{2}", str(count))
                    if NR is not None:
                        title1 = title_cut[0][0] + str(count) + ".*" + title_cut[0][-1]
                        self.range_parse(link, title1)
                    else:
                        title1 = title_cut[0][0] + "0" + str(count) + ".*" + title_cut[0][-1]
                        self.range_parse(link, title1)
            except ValueError as e:
                logging.error("Raised ValueError exception: %s" % e.message)
        else:
            self.parse_download(link, title)

    def range_parse(self, series_url, search_title):
        req_page = getURL(series_url)
        soup = BeautifulSoup(req_page)
        try:
            titles = soup.findAll(text=re.compile(search_title))
            for title in titles:
                if self.quality != '480p' and self.quality in title:
                    self.parse_download(series_url, title)
                if self.quality == '480p' and not (('.720p.' in title) or ('.1080p.' in title)):
                    self.parse_download(series_url, title)
        except re.error as e:
            self.log_error('sre_constants.error: %s' % e)

    def parse_download(self, series_url, search_title):
        req_page = getURL(series_url)
        soup = BeautifulSoup(req_page)
        title = soup.find(text=re.compile(search_title))
        if title:
            items = []
            links = title.parent.parent.findAll('a')
            for link in links:
                url = link['href']
                pattern = '.*%s_.*' % self.hoster
                if re.match(pattern, url):
                    items.append(url)
            self.send_package(title, items) if len(items) > 0 else True

    def send_package(self, title, link):
        try:
            storage = self.db.retrieve(title)
        except Exception as e:
            self.log_debug("db.retrieve got exception: %s, title: %s" % (e, title))
        if storage == 'downloaded':
            self.log_debug(title + " already downloaded")
        else:
            self.log_info("NEW RELEASE: " + title)
            self.db.store(title, 'downloaded')
            write_crawljob_file(title, title, link,
                                self.config.get('crawljob_directory')
                                ) and self.added_items.append(title.encode("utf-8"))
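A hedged sketch of wiring the SJ crawler up; it assumes RepeatableTimer (not shown in this section) invokes periodical_task every interval minutes once start() has been called.

# Hedged usage sketch: construct the crawler and start its repeating timer.
sj_crawler = SJ()
sj_crawler.activate()         # sets _periodical_active and starts the timer
sj_crawler.periodical_task()  # a single manual run is also possible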
class MovieblogFeed():
    FEED_URL = "http://www.movie-blog.org/feed/"
    SUBSTITUTE = "[&#\s/]"
    _INTERNAL_NAME = 'MB'

    def __init__(self):
        self.config = RssConfig(self._INTERNAL_NAME)
        self.log_info = logging.info
        self.log_error = logging.error
        self.log_debug = logging.debug
        list([_mkdir_p(os.path.dirname(self.config.get(f))) for f in ['db_file', 'patternfile']])
        _mkdir_p(self.config.get('crawljob_directory'))
        self.db = RssDb(self.config.get('db_file'))
        self._hosters_pattern = self.config.get('hoster').replace(';', '|')
        self._periodical_active = False
        self.periodical = RepeatableTimer(
            int(self.config.get('interval')) * 60, self.periodical_task)
        self.dictWithNamesAndLinks = {}

    def activate(self):
        self._periodical_active = True
        self.periodical.start()
        return self

    def readInput(self, file):
        if not os.path.isfile(file):
            open(file, "a").close()
            placeholder = open(file, 'w')
            placeholder.write('ADD ALL MOVIES YOU WANT TO CRAWL FOR AS NEW LINES IN THIS FILE\n')
            placeholder.close()
        try:
            f = codecs.open(file, "rb")
            return f.read().splitlines()
        except:
            self.log_error("Inputfile not found")

    def getPatterns(self, patterns, quality, rg, sf):
        return {line: (quality, rg, sf) for line in patterns}

    def searchLinks(self, feed):
        ignore = "|".join(["\.%s\." % p for p in self.config.get("ignore").lower().split(',')
                           if not self.config.get('crawl3d') or p != '3d']) \
            if not self.config.get("ignore") == "" else "^unmatchable$"
        for key in self.allInfos:
            s = re.sub(self.SUBSTITUTE, ".", key).lower()
            for post in feed.entries:
                """Search for title"""
                found = re.search(s, post.title.lower())
                if found:
                    """Check if we have to ignore it"""
                    found = re.search(ignore, post.title.lower())
                    if found:
                        self.log_debug("Ignoring [%s]" % post.title)
                        continue
                    """Search for quality"""
                    ss = self.allInfos[key][0].lower()
                    if '.3d.' in post.title.lower():
                        if self.config.get('crawl3d') and ("1080p" in post.title.lower() or "1080i" in post.title.lower()):
                            found = True
                        else:
                            continue
                    else:
                        if ss == "480p":
                            if "720p" in post.title.lower() or "1080p" in post.title.lower() or "1080i" in post.title.lower():
                                continue
                            found = True
                        else:
                            found = re.search(ss, post.title.lower())
                    if found:
                        """Search for releasegroup"""
                        sss = "[\.-]+" + self.allInfos[key][1].lower()
                        found = re.search(sss, post.title.lower())
                        if self.allInfos[key][2]:
                            # If all True, then found = True
                            found = all([word in post.title.lower() for word in self.allInfos[key][2]])
                        if found:
                            try:
                                episode = re.search(r'([\w\.\s]*s\d{1,2}e\d{1,2})[\w\.\s]*', post.title.lower()).group(1)
                                if "repack" in post.title.lower():
                                    episode = episode + "-repack"
                                self.log_debug("TV-Series detected, will shorten its name to [%s]" % episode)
                                yield (episode, [post.link], key)
                            except:
                                yield (post.title, [post.link], key)

    def _get_download_links(self, url, hosters_pattern=None):
        tree = html.fromstring(requests.get(url).content)
        xpath = '//*[@id="content"]/span/div/div[2]/p//strong[contains(text(),"Download:") or contains(text(),"Mirror #")]/following-sibling::a[1]'
        return [common.get_first(link.xpath('./@href')) for link in tree.xpath(xpath)
                if hosters_pattern is None or re.search(hosters_pattern, link.text, flags=re.IGNORECASE)]

    @_restart_timer
    def periodical_task(self):
        urls = []
        text = []
        dl = {key: ('.*', '.*', ('.dl.',)) for key in self.db.get_patterns('notdl')}
        self.allInfos = dict(
            set({key: dl[key] if key in dl else value
                 for (key, value) in self.getPatterns(
                     self.readInput(self.config.get("patternfile")),
                     self.config.get('quality'), '.*', None).items()}.items()) |
            set(self.getPatterns(
                self.readInput(self.config.get("seasonslist")),
                self.config.get('seasonsquality'), '.*',
                ('.complete.', '.' + self.config.get('seasonssource') + '.')).items()
                if self.config.get('crawlseasons') else []))
        if self.config.get("historical"):
            for xline in self.allInfos.keys():
                if len(xline) > 0 and not xline.startswith("#"):
                    xn = xline.split(",")[0].replace(".", " ").replace(" ", "+")
                    urls.append('http://www.movie-blog.org/search/%s/feed/rss2/' % xn)
        else:
            urls.append(self.FEED_URL)
        for url in urls:
            for (key, value, pattern) in self.searchLinks(feedparser.parse(url)):
                if self.db.retrieve(key) == 'added' or self.db.retrieve(key) == 'notdl':
                    self.log_debug("[%s] has already been added" % key)
                else:
                    self.db.store(
                        key,
                        'notdl' if self.config.get('enforcedl') and '.dl.' not in key.lower() else 'added',
                        pattern)
                    self.log_info("NEW RELEASE: " + key)
                    download_link = [common.get_first(self._get_download_links(value[0], self._hosters_pattern))]
                    if any(download_link):
                        write_crawljob_file(key, key, download_link,
                                            self.config.get("crawljob_directory")) and text.append(key)
        if len(text) > 0:
            notifyPushbulletMB(self.config.get("pushbulletapi"), text)
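The same pattern applies to MovieblogFeed, whose activate() returns self so construction and start can be chained; a hedged sketch:

# Hedged usage sketch: activate() returns the instance, so it can be chained.
mb_crawler = MovieblogFeed().activate()
mb_crawler.periodical_task()  # one immediate crawl of the configured feeds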
class SJ():
    MIN_CHECK_INTERVAL = 2 * 60  # 2 minutes
    _INTERNAL_NAME = 'SJ'

    def __init__(self):
        self.config = RssConfig(self._INTERNAL_NAME)
        self.log_info = logging.info
        self.log_error = logging.error
        self.log_debug = logging.debug
        list([_mkdir_p(os.path.dirname(self.config.get(f))) for f in ['db_file', 'file']])
        _mkdir_p(self.config.get('crawljob_directory'))
        self.db = RssDb(self.config.get('db_file'))
        self._periodical_active = False
        self.periodical = RepeatableTimer(
            int(self.config.get('interval')) * 60, self.periodical_task)

    def activate(self):
        self._periodical_active = True
        self.periodical.start()

    @_restart_timer
    def periodical_task(self):
        feed = feedparser.parse('http://serienjunkies.org/xml/feeds/episoden.xml')
        self.pattern = "|".join(getSeriesList(self.config.get("file"))).lower()
        reject = self.config.get("rejectlist").replace(";", "|").lower() if len(
            self.config.get("rejectlist")) > 0 else "^unmatchable$"
        self.quality = self.config.get("quality")
        self.hoster = self.config.get("hoster")
        if self.hoster == "alle":
            self.hoster = "."
        self.added_items = []
        for post in feed.entries:
            link = post.link
            title = post.title
            if str2bool(self.config.get("regex")):
                m = re.search(self.pattern, title.lower())
                if not m and not "720p" in title and not "1080p" in title:
                    m = re.search(self.pattern.replace("480p", "."), title.lower())
                    self.quality = "480p"
                if m:
                    if "720p" in title.lower():
                        self.quality = "720p"
                    if "1080p" in title.lower():
                        self.quality = "1080p"
                    m = re.search(reject, title.lower())
                    if m:
                        self.log_debug("Rejected: " + title)
                        continue
                    title = re.sub('\[.*\] ', '', post.title)
                    self.range_checkr(link, title)
            else:
                if self.config.get("quality") != '480p':
                    m = re.search(self.pattern, title.lower())
                    if m:
                        if self.config.get("language") in title:
                            mm = re.search(self.quality, title.lower())
                            if mm:
                                mmm = re.search(reject, title.lower())
                                if mmm:
                                    self.log_debug("Rejected: " + title)
                                    continue
                                title = re.sub('\[.*\] ', '', post.title)
                                self.range_checkr(link, title)
                else:
                    m = re.search(self.pattern, title.lower())
                    if m:
                        if self.config.get("language") in title:
                            if "720p" in title.lower() or "1080p" in title.lower():
                                continue
                            mm = re.search(reject, title.lower())
                            if mm:
                                self.log_debug("Rejected: " + title)
                                continue
                            title = re.sub('\[.*\] ', '', post.title)
                            self.range_checkr(link, title)
        if len(self.config.get('pushbulletapi')) > 2:
            notifyPushbulletSJ(self.config.get("pushbulletapi"), self.added_items) if len(self.added_items) > 0 else True

    def range_checkr(self, link, title):
        pattern = re.match(".*S\d{2}E\d{2}-\w?\d{2}.*", title)
        if pattern is not None:
            range0 = re.sub(r".*S\d{2}E(\d{2}-\w?\d{2}).*", r"\1", title).replace("E", "")
            number1 = re.sub(r"(\d{2})-\d{2}", r"\1", range0)
            number2 = re.sub(r"\d{2}-(\d{2})", r"\1", range0)
            title_cut = re.findall(r"(.*S\d{2}E)(\d{2}-\w?\d{2})(.*)", title)
            try:
                for count in range(int(number1), (int(number2) + 1)):
                    # Check whether the episode number already has two digits.
                    NR = re.match(r"\d{2}", str(count))
                    if NR is not None:
                        title1 = title_cut[0][0] + str(count) + ".*" + title_cut[0][-1]
                        self.range_parse(link, title1)
                    else:
                        title1 = title_cut[0][0] + "0" + str(count) + ".*" + title_cut[0][-1]
                        self.range_parse(link, title1)
            except ValueError as e:
                logging.error("Raised ValueError exception: %s" % e.message)
        else:
            self.parse_download(link, title)

    def range_parse(self, series_url, search_title):
        req_page = getURL(series_url)
        soup = BeautifulSoup(req_page)
        try:
            titles = soup.findAll(text=re.compile(search_title))
            for title in titles:
                if self.quality != '480p' and self.quality in title:
                    self.parse_download(series_url, title)
                if self.quality == '480p' and not (('.720p.' in title) or ('.1080p.' in title)):
                    self.parse_download(series_url, title)
        except re.error as e:
            self.log_error('sre_constants.error: %s' % e)

    def parse_download(self, series_url, search_title):
        req_page = getURL(series_url)
        soup = BeautifulSoup(req_page)
        title = soup.find(text=re.compile(search_title))
        if title:
            items = []
            links = title.parent.parent.findAll('a')
            for link in links:
                url = link['href']
                pattern = '.*%s_.*' % self.hoster
                if re.match(pattern, url):
                    items.append(url)
            self.send_package(title, items) if len(items) > 0 else True

    def send_package(self, title, link):
        try:
            storage = self.db.retrieve(title)
        except Exception as e:
            self.log_debug("db.retrieve got exception: %s, title: %s" % (e, title))
        if storage == 'downloaded':
            self.log_debug(title + " already downloaded")
        else:
            self.log_info("NEW RELEASE: " + title)
            self.db.store(title, 'downloaded')
            write_crawljob_file(title, title, link,
                                self.config.get('crawljob_directory')
                                ) and self.added_items.append(title.encode("utf-8"))
class MovieblogFeed():
    FEED_URL = "http://www.movie-blog.org/feed/"
    SUBSTITUTE = "[&#\s/]"
    _INTERNAL_NAME = 'MB'

    def __init__(self):
        self.config = RssConfig(self._INTERNAL_NAME)
        self.log_info = logging.info
        self.log_error = logging.error
        self.log_debug = logging.debug
        list([_mkdir_p(os.path.dirname(self.config.get(f))) for f in ['db_file', 'patternfile']])
        _mkdir_p(self.config.get('crawljob_directory'))
        self.db = RssDb(self.config.get('db_file'))
        self._periodical_active = False
        self.periodical = RepeatableTimer(
            int(self.config.get('interval')) * 60, self.periodical_task)

    def activate(self):
        self._periodical_active = True
        self.periodical.start()
        return self

    def readInput(self):
        if not os.path.isfile(self.config.get("patternfile")):
            open(self.config.get("patternfile"), "a").close()
        try:
            f = codecs.open(self.config.get("patternfile"), "rb")
            return f.read().splitlines()
        except:
            self.log_error("Inputfile not found")

    def getPatterns(self):
        out = {}
        for line in self.mypatterns:
            if len(line) == 0 or line.startswith("#"):
                continue
            try:
                n = line.split(",")[0]
                q = line.split(",")[1]
                r = line.split(",")[2]
            except:
                self.log_error("Syntax error in [%s] detected, please take corrective action" % self.config.get("patternfile"))
            try:
                d = line.split(",")[3]
            except:
                d = ""
            if q == "":
                q = r'.*'
            if r == "":
                r = r'.*'
            out[n] = [q, r, d]
        return out

    def searchLinks(self):
        ignore = self.config.get("ignore").lower().replace(",", "|") if not self.config.get("ignore") == "" else "^unmatchable$"
        for key in self.allInfos:
            s = re.sub(self.SUBSTITUTE, ".", key).lower()
            for post in self.feed.entries:
                """Search for title"""
                found = re.search(s, post.title.lower())
                if found:
                    """Check if we have to ignore it"""
                    found = re.search(ignore, post.title.lower())
                    if found:
                        self.log_debug("Ignoring [%s]" % post.title)
                        continue
                    """Search for quality"""
                    ss = self.allInfos[key][0].lower()
                    if ss == "480p":
                        if "720p" in post.title.lower() or "1080p" in post.title.lower() or "1080i" in post.title.lower():
                            continue
                        found = True
                    else:
                        found = re.search(ss, post.title.lower())
                    if found:
                        """Search for releasegroup"""
                        sss = "[\.-]+" + self.allInfos[key][1].lower()
                        found = re.search(sss, post.title.lower())
                        if found:
                            try:
                                episode = re.search(r'([\w\.\s]*s\d{1,2}e\d{1,2})[\w\.\s]*', post.title.lower()).group(1)
                                if "repack" in post.title.lower():
                                    episode = episode + "-repack"
                                self.log_debug("TV-Series detected, will shorten its name to [%s]" % episode)
                                self.dictWithNamesAndLinks[episode] = [post.link]
                            except:
                                self.dictWithNamesAndLinks[post.title] = [post.link]

    @_restart_timer
    def periodical_task(self):
        urls = []
        text = []
        self.mypatterns = self.readInput()
        self.dictWithNamesAndLinks = {}
        self.allInfos = self.getPatterns()
        if self.config.get("historical"):
            for xline in self.mypatterns:
                if len(xline) == 0 or xline.startswith("#"):
                    continue
                xn = xline.split(",")[0].replace(".", " ").replace(" ", "+")
                urls.append('http://www.movie-blog.org/search/%s/feed/rss2/' % xn)
        else:
            urls.append(self.FEED_URL)
        for url in urls:
            self.feed = feedparser.parse(url)
            self.searchLinks()
        for key in self.dictWithNamesAndLinks:
            if not self.db.retrieve(key) == 'added':
                self.db.store(key, 'added')
                self.log_info("NEW RELEASE: " + key)
                write_crawljob_file(key, key, [self.dictWithNamesAndLinks[key][0]],
                                    self.config.get("crawljob_directory")) and text.append(key)
            else:
                self.log_debug("[%s] has already been added" % key)
        if len(text) > 0:
            notifyPushbulletMB(self.config.get("pushbulletapi"), text)
def load(dockerglobal):
    main = RssConfig('RSScrawler')
    jdownloader = main.get("jdownloader")
    port = main.get("port")
    prefix = main.get("prefix")
    interval = main.get("interval")
    hoster = main.get("hoster")
    pushbulletapi = main.get("pushbulletapi")
    # MB section
    mb = RssConfig('MB')
    mbquality = mb.get("quality")
    ignore = mb.get("ignore")
    historical = str(mb.get("historical"))
    mbregex = str(mb.get("regex"))
    cutoff = str(mb.get("cutoff"))
    crawl3d = str(mb.get("crawl3d"))
    enforcedl = str(mb.get("enforcedl"))
    crawlseasons = str(mb.get("crawlseasons"))
    seasonsquality = mb.get("seasonsquality")
    seasonssource = mb.get("seasonssource")
    # SJ section
    sj = RssConfig('SJ')
    sjquality = sj.get("quality")
    rejectlist = sj.get("rejectlist")
    sjregex = str(sj.get("regex"))
    # Convert the values for the HTML form
    if hoster == 'Share-Online':
        hosterso = ' selected'
        hosterul = ''
    else:
        hosterso = ''
        hosterul = ' selected'
    if mbquality == '1080p':
        mbq1080 = ' selected'
        mbq720 = ''
        mbq480 = ''
    if mbquality == '720p':
        mbq1080 = ''
        mbq720 = ' selected'
        mbq480 = ''
    if mbquality == '480p':
        mbq1080 = ''
        mbq720 = ''
        mbq480 = ' selected'
    if seasonsquality == '1080p':
        msq1080 = ' selected'
        msq720 = ''
        msq480 = ''
    if seasonsquality == '720p':
        msq1080 = ''
        msq720 = ' selected'
        msq480 = ''
    if seasonsquality == '480p':
        msq1080 = ''
        msq720 = ''
        msq480 = ' selected'
    if sjquality == '1080p':
        sjq1080 = ' selected'
        sjq720 = ''
        sjq480 = ''
    if sjquality == '720p':
        sjq1080 = ''
        sjq720 = ' selected'
        sjq480 = ''
    if sjquality == '480p':
        sjq1080 = ''
        sjq720 = ''
        sjq480 = ' selected'
    if historical == 'True':
        historicaltrue = ' selected'
        historicalfalse = ''
    else:
        historicaltrue = ''
        historicalfalse = ' selected'
    if mbregex == 'True':
        mbregextrue = ' selected'
        mbregexfalse = ''
        mrdiv = "block"
    else:
        mbregextrue = ''
        mbregexfalse = ' selected'
        mrdiv = "none"
    if cutoff == 'True':
        cutofftrue = ' selected'
        cutofffalse = ''
    else:
        cutofftrue = ''
        cutofffalse = ' selected'
    if crawl3d == 'True':
        crawl3dtrue = ' selected'
        crawl3dfalse = ''
        tddiv = "block"
    else:
        crawl3dtrue = ''
        crawl3dfalse = ' selected'
        tddiv = "none"
    if enforcedl == 'True':
        enforcedltrue = ' selected'
        enforcedlfalse = ''
    else:
        enforcedltrue = ''
        enforcedlfalse = ' selected'
    if crawlseasons == 'True':
        crawlseasonstrue = ' selected'
        crawlseasonsfalse = ''
        ssdiv = "block"
    else:
        crawlseasonstrue = ''
        crawlseasonsfalse = ' selected'
        ssdiv = "none"
    if sjregex == 'True':
        sjregextrue = ' selected'
        sjregexfalse = ''
        srdiv = "block"
    else:
        sjregextrue = ''
        sjregexfalse = ' selected'
        srdiv = "none"
    # Detect the prefix
    if prefix:
        prefix = '/' + prefix
    # Detect a Docker environment
    if dockerglobal == '1':
        dockerblocker = ' readonly="readonly"'
        dockerhint = 'Docker-Modus: Kann nur per Docker-Run angepasst werden! '
    else:
        dockerblocker = ''
        dockerhint = ''
    return (jdownloader, port, prefix, interval, hoster, pushbulletapi, mbquality, ignore, historical,
            mbregex, cutoff, crawl3d, enforcedl, crawlseasons, seasonsquality, seasonssource, sjquality,
            rejectlist, sjregex, hosterso, hosterul, mbq1080, mbq720, mbq480, msq1080, msq720, msq480,
            sjq1080, sjq720, sjq480, historicaltrue, historicalfalse, mbregextrue, mbregexfalse, mrdiv,
            cutofftrue, cutofffalse, crawl3dtrue, crawl3dfalse, tddiv, enforcedltrue, enforcedlfalse,
            crawlseasonstrue, crawlseasonsfalse, ssdiv, sjregextrue, sjregexfalse, srdiv, dockerblocker,
            dockerhint)
def write_crawljob_file(package_name, folder_name, link_text, crawljob_dir, subdir):
    # Crawljobs end with .crawljob
    crawljob_file = crawljob_dir + '/%s.crawljob' % unicode(
        # Characters and spaces that are not Windows-safe are removed
        re.sub('[^\w\s\.-]', '', package_name.replace(' ', '')).strip().lower())
    # Try to write the .crawljob
    crawljobs = RssConfig('Crawljobs')
    autostart = crawljobs.get("autostart")
    usesubdir = crawljobs.get("subdir")
    if usesubdir == "False":
        subdir = ""
    if autostart == "True":
        autostart = "TRUE"
    else:
        autostart = "FALSE"
    try:
        # Open the crawljob for writing
        file = open(crawljob_file, 'w')
        # Package options for JDownloader:
        # The package is enabled
        file.write('enabled=TRUE\n')
        # The download starts automatically
        file.write('autoStart=' + autostart + '\n')
        # Add extraction passwords
        file.write('extractPasswords=["' + "bW92aWUtYmxvZy5vcmc=".decode('base64') + '","' +
                   "c2VyaWVuanVua2llcy5vcmc=".decode('base64') + '"]\n')
        # Extract archives automatically after the download
        file.write('extractAfterDownload=TRUE\n')
        # Force the automatic start
        file.write('forcedStart=' + autostart + '\n')
        # Confirm JDownloader prompts automatically
        file.write('autoConfirm=' + autostart + '\n')
        # The download folder is folder_name; subdir is prepended when it is not empty.
        # The subdir helps with automation (e.g. via Filebot).
        if not subdir == "":
            file.write('downloadFolder=' + subdir + "/" + '%s\n' % folder_name)
            # Lower priority for enforced two-language downloads
            if subdir == "RSScrawler/Remux":
                file.write('priority=Lower\n')
        else:
            file.write('downloadFolder=' + '%s\n' % folder_name)
        # The package name in JDownloader is package_name (without spaces!)
        file.write('packageName=%s\n' % package_name.replace(' ', ''))
        # Use the first (per the code: only!) entry of the link_text array as the download link
        file.write('text=%s\n' % link_text)
        # Finish writing
        file.close()
        # Report the successful write
        return True
    # On errors:
    except UnicodeEncodeError as e:
        # Stop writing
        file.close()
        # Log the error, including the crawljob path and the error message
        logging.error("Beim Schreibversuch des Crawljobs: %s FEHLER: %s" % (crawljob_file, e.message))
        # If a broken crawljob was left behind
        if os.path.isfile(crawljob_file):
            # Log the cleanup step
            logging.info("Entferne defekten Crawljob: %s" % crawljob_file)
            # Remove the crawljob
            os.remove(crawljob_file)
        # Report the failed write
        return False
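A hedged usage sketch for write_crawljob_file; the package name, folder, link and folderwatch path are made-up values, and the real callers pass the crawljob_directory from their RssConfig section.

# All values below are illustrative placeholders.
ok = write_crawljob_file(
    'Some.Movie.2017.German.DL.1080p',    # package_name
    'Some.Movie.2017.German.DL.1080p',    # folder_name
    ['http://example.com/some-release'],  # link_text (single-entry list)
    '/jd2/folderwatch',                   # crawljob_dir
    'RSScrawler')                         # subdir
if not ok:
    logging.error("Crawljob could not be written")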