def main(): """pass""" # load weibonames path = os.path.abspath( os.path.join(os.path.dirname(__file__), 'conf/weibonames.conf')) with open(path) as fp: weibonames = [ line.strip().decode('utf_8') for line in fp if not line.startswith('#') ] # create database dbpath = os.path.abspath(os.path.join(os.path.dirname(__file__), DBNAME)) if not os.path.exists(dbpath): create_database(dbpath) # start crawler thread for name in weibonames: crawler = CrawlerThread(weiboname=name) crawler.start() # start download thread downloader = DownloadManager() downloader.start()
def __init__(self):
    self.downloader = DownloadManager()
    self.webpage = None
    self.init_database()
    self.rules = {}
    self.files = []
    self.file_rule = ".+"
def download(self, item, startCB, finishCB, playDownload=False, mode="", overrideCB=None):
    """Downloads PVideo item; calls startCB when the download starts
    and finishCB when the download finishes.
    """
    quiet = False
    headers = item.settings['extra-headers']
    log.debug("Download headers %s", headers)
    downloadManager = DownloadManager.getInstance()
    d = downloadManager.createDownload(name=item.name, url=item.url,
                                       stream=item.stream, filename=item.filename,
                                       live=item.live, destination=self.downloads_path,
                                       startCB=startCB, finishCB=finishCB, quiet=quiet,
                                       playDownload=playDownload, headers=headers,
                                       mode=mode)
    if item.subs is not None and item.subs != '':
        log.debug('subtitles link: %s', item.subs)
        subs_file_path = os.path.splitext(d.local)[0] + '.srt'
        util.download_to_file(item.subs, subs_file_path)
    downloadManager.addDownload(d, overrideCB)
def get_downloads(self):
    video_lst = []
    if not os.path.isdir(self.downloads_path):
        util.make_path(self.downloads_path)
    downloads = os.listdir(self.downloads_path)
    for download in downloads:
        download_path = os.path.join(self.downloads_path, download)
        if os.path.isdir(download_path):
            continue
        if os.path.splitext(download_path)[1] in VIDEO_EXTENSIONS:
            filename = os.path.basename(os.path.splitext(download_path)[0])
            url = download_path
            subs = None
            if filename in [os.path.splitext(x)[0] for x in downloads
                            if os.path.splitext(x)[1] in SUBTITLES_EXTENSIONS]:
                subs = filename + ".srt"
            it = PDownload(download_path)
            it.name = filename
            it.url = url
            it.subs = subs
            downloadManager = DownloadManager.getInstance()
            download = downloadManager.findDownloadByIT(it)
            if download is not None:
                it.finish_time = download.finish_time
                it.start_time = download.start_time
                it.state = download.state
                it.textState = download.textState
            video_lst.append(it)
    return video_lst
def do_download():
    try:
        # have to rename to start_cb, otherwise python
        # doesn't see start_callback
        start_cb = start_callback
        finish_cb = finish_callback
        if start_cb is None:
            start_cb = DownloadManagerMessages.startDownloadCB
        if finish_cb is None:
            finish_cb = DownloadManagerMessages.finishDownloadCB
        override_cb = DownloadManagerMessages.overrideDownloadCB
        downloadManager = DownloadManager.getInstance()
        d = downloadManager.createDownload(
            name=item.name, url=item.url, stream=item.stream,
            filename=filename[0], live=item.live,
            destination=destination[0], startCB=start_cb,
            finishCB=finish_cb, quiet=False,
            playDownload=play_download, headers=headers, mode=mode)
        if item.subs:
            remote = item.subs
            local = os.path.splitext(d.local)[0] + '.srt'
            if os.path.isfile(remote):
                copyfile(remote, local)
            elif remote.startswith('http'):
                util.download_to_file(remote, local)
        downloadManager.addDownload(d, override_cb)
    except:
        log.logError("Download '%s' failed.\n%s" % (item.name, traceback.format_exc()))
        session.openWithCallback(ask_if_download_callback, MessageBox,
                                 text=_("Download error, look into the log file."),
                                 timeout=10, type=MessageBox.TYPE_ERROR)
def do_download():
    # have to rename to start_cb, otherwise python
    # doesn't see start_callback
    start_cb = start_callback
    finish_cb = finish_callback
    if start_cb is None:
        start_cb = DownloadManagerMessages.startDownloadCB
    if finish_cb is None:
        finish_cb = DownloadManagerMessages.finishDownloadCB
    override_cb = DownloadManagerMessages.overrideDownloadCB
    downloadManager = DownloadManager.getInstance()
    d = downloadManager.createDownload(name=item.name, url=item.url,
                                       stream=item.stream, filename=item.filename,
                                       live=item.live, destination=destination[0],
                                       startCB=start_cb, finishCB=finish_cb,
                                       quiet=False, playDownload=play_download,
                                       headers=headers, mode=mode)
    if item.subs:
        remote = item.subs
        local = os.path.splitext(d.local)[0] + '.srt'
        if os.path.isfile(remote):
            copyfile(remote, local)
        elif remote.startswith('http'):
            util.download_to_file(remote, local)
    downloadManager.addDownload(d, override_cb)
def __init__(self, webconf):
    self.webconf = webconf
    config = ConfigParser.ConfigParser()
    config.readfp(open('/etc/raspadmin/downloader.conf'))
    self.path = config.get("PATH", "downloadrep")
    self.alldebrid = 0
    try:
        if config.get("ALLDEBRID", "usealldebrid").upper() == "Y":
            self.alldebriduser = config.get("ALLDEBRID", "alldebriduser")
            self.alldebridpass = config.get("ALLDEBRID", "alldebridpass")
            self.alldebrid = AllDebrid(self.alldebriduser, self.alldebridpass)
    except:
        self.alldebrid = None
    self.downloadManager = DownloadManager(self.path)
    rarfile.UNRAR_TOOL = "unrar-nonfree"
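# A minimal /etc/raspadmin/downloader.conf sketch, limited to the sections and
# options the constructor above reads; the values are illustrative, not from the
# original source.
#
#   [PATH]
#   downloadrep = /var/lib/raspadmin/downloads
#
#   [ALLDEBRID]
#   usealldebrid = Y
#   alldebriduser = example_user
#   alldebridpass = example_password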
class Crawler():

    def __init__(self):
        self.downloader = DownloadManager()
        self.webpage = None
        self.init_database()
        self.rules = {}

    def init_database(self):
        self.queue = QueueDB('queue.db')
        self.webpagedb = WebpageDB('webpage.db')
        self.duplcheck = DuplCheckDB('duplcheck.db')

    def add_seeds(self, links):
        new_links = self.duplcheck.filter_dupl_urls(links)
        self.duplcheck.add_urls(new_links)
        self.queue.push_urls(new_links)

    def add_rules(self, rules):
        self.rules = {}
        for url, inurls in rules.items():
            reurl = re.compile(url)
            repatn = []
            for u in inurls:
                repatn.append(re.compile(u))
            self.rules[reurl] = repatn

    def get_patterns_from_rules(self, url):
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) != None:
                patns.extend(ru)
        return list(set(patns))

    def start(self):
        while 1:
            url = self.queue.pop_url()
            print url
            if url == None:
                print "crawling task is done."
                break
            error_msg, url, redirected_url, html = self.downloader.download(url)
            #print error_msg, url, redirected_url, html
            if html != None:
                self.webpagedb.html2db(url, html)
                self.webpage = WebPage(url, html)
                self.webpage.parse_links()
                ruptn = self.get_patterns_from_rules(url)
                print ruptn
                links = self.webpage.filter_links(tags=['a'], patterns=ruptn)
                self.add_seeds(links)
            self.mysleep(3)

    def mysleep(self, n):
        for i in range(n):
            time.sleep(1)
            print "sleep", i, "of", n
class Crawler():

    def __init__(self):
        self.downloader = DownloadManager()
        self.webpage = None
        self.rules = {}
        self.dbop = OperatorDB()

    def add_seeds(self, links):
        self.dbop.add_seeds(links)

    def add_rules(self, rules):
        self.rules = {}
        for url, inurls in rules.items():
            reurl = re.compile(url)
            repatn = []
            for u in inurls:
                repatn.append(re.compile(u))
            self.rules[reurl] = repatn

    def get_patterns_from_rules(self, url):
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) != None:
                patns.extend(ru)
        return list(set(patns))

    def start(self):
        while 1:
            try:
                url = self.dbop.pop_url()
                print "url: %s" % url
                if url == None:
                    print "crawling task is done."
                    break
                error_msg, url, redirected_url, html = self.downloader.download(url)
                #print error_msg, url, redirected_url, html
                if html != None:
                    self.webpage = WebPage(url, html)
                    article = self.webpage.extract()
                    if len(article) > 5:
                        addtime = "%s %s" % (article[1], article[2])
                        self.dbop.html2db(url, html, article[0], addtime,
                                          article[3], article[5])
                    else:
                        self.dbop.html2db(url, html)
                    print self.webpage.parse_links()
                    ruptn = self.get_patterns_from_rules(url)
                    links = self.webpage.filter_links(tags=['a'], str_patterns=ruptn)
                    self.add_seeds(links)
                self.mysleep(3)
            except Exception, err:
                print "!!error!! Exception happened! %s %s" % (url, err)
        self.dbop.close()
class WebManager(WebStructure.WebAbstract):

    def __init__(self, webconf):
        self.webconf = webconf
        config = ConfigParser.ConfigParser()
        config.readfp(open('/etc/raspadmin/downloader.conf'))
        self.path = config.get("PATH", "downloadrep")
        self.downloadManager = DownloadManager(self.path)

    def manage_download(self, str_list):
        list_download = [
            y for y in (x.strip() for x in str_list.splitlines()) if y
        ]
        for url in list_download:
            self.downloadManager.addDownload(url)

    def get_html(self, http_context):
        template = ['header.tpl', 'downloader/downloader.tpl', 'footer.tpl']
        sessionid = http_context.sessionid
        sessionvars = http_context.session.get_vars(sessionid)
        post = http_context.http_post
        if 'str_download' in post.keys():
            self.manage_download(post['str_download'])
        if http_context.suburl == 'getInfo':
            return WebStructure.HttpContext(
                statuscode=200,
                content=json.dumps(self.downloadManager.downloadStatus()),
                template=None,
                mimetype='text/html')
        content = {
            'token': sessionvars['posttoken'],
            'includefile': 'downloader/headerdownloader.html'
        }
        return WebStructure.HttpContext(statuscode=200, content=content,
                                        template=template, mimetype='text/html')

    def get_module_name(self):
        return "Downloader"
def __init__(self):
    self._cfg = _load_cfg()
    try:
        with open(os.path.join(APP_PATH, 'data.json')) as json_fp:
            user_data = json.load(json_fp)
    except IOError:
        logging.warning('No user data')
        user_data = {}
    self._download_manager = DownloadManager(self._cfg['user'],
                                             self._cfg['password'],
                                             user_data.get('session_id'))
def handleRolloutRequest(cmd):
    print("Rollouts correlationId is: " + str(cmd.getRolloutsCorrelationId()))
    # print('Parsing software module information')
    for swMod in cmd.getSoftwareModules():
        execResult = ""
        featureId = swMod.name.replace(":", "-") + "-" + swMod.version
        swCache = SoftwareFeatureCache.loadOrCreate(featureId)
        # print(swMod.toJson())
        for art in swMod.artifacts:
            updateLastOperation(cmd, "DOWNLOADING", "Downloading " + art.name, swMod)
            filePath = DownloadManager().download(art)
            swCache.addFile(filePath)
            updateLastOperation(cmd, "DOWNLOADED", "Downloaded " + art.name, swMod)
            # https://vorto.eclipseprojects.io/#/details/vorto.private.test:Executor:1.0.0
            updateLastOperation(cmd, "INSTALLING", "Executing script: " + filePath, swMod)
            res = "Installed a script to the location {}.\n".format(filePath)
            updateLastOperation(cmd, "INSTALLED", execResult, swMod)
            execResult += res
        swCache.save()
        swCache.updateDittoFeature(client, deviceInfo, execResult)
        updateLastOperation(cmd, "FINISHED_SUCCESS", execResult, swMod)
def main(ctx, url, name, num_threads):
    num_downloads = num_threads
    url = url
    file_name = name
    if not file_name:
        file_name = parse_filename(url)
    with OneLinePrinter("Initializing..."):
        manager = DownloadManager(url)
        manager.create_downloads(num_downloads)
        file_size = manager.size
    formatter = ProgressBarFormatter('Downloading')
    with OneLinePrinter(formatter):
        with DownloadFile(file_name, file_size) as f:
            for download in manager.iter_downloads():
                b = formatter.create_bar(download.length)
                formatter.add_bar(b)
                download.bind(on_chunk, b, f)
                download.start()
            for download in manager.iter_downloads():
                download.join()
# form
def get_form(self, index):
    form = self.doc.forms[index]
    form.action = urlparse.urljoin(self.url, form.action)
    return form.action, form.fields

#
def get_html(self):
    return self.html


if __name__ == "__main__":
    import time
    from downloader import DownloadManager

    downloader = DownloadManager()
    url = "http://www.cs.colorado.edu/"
    error_msg, url, redirected_url, html = downloader.download(url)
    print error_msg, url, redirected_url, len(html)
    time.sleep(2)

    page = WebPage(url, html)
    page.parse_links()
    links = page.filter_links(tags=["a"],
                              str_patterns=["^(http://www\.cs\.colorado\.edu)(/info.+)$"])
    elements = page.doc.findall("./body//div")
    for e in elements:
        print "ELEMENTS =========================================="
        print lxml.html.tostring(e, pretty_print=True)
        print "ITEMS------------------------------------------"
# form
def get_form(self, index):
    form = self.doc.forms[index]
    form.action = urlparse.urljoin(self.url, form.action)
    return form.action, form.fields

#
def get_html(self):
    return self.html


if __name__ == "__main__":
    import time
    from downloader import DownloadManager

    downloader = DownloadManager()
    url = "http://www.cs.colorado.edu/"
    error_msg, url, redirected_url, html = downloader.download(url)
    print error_msg, url, redirected_url, len(html)
    time.sleep(2)

    page = WebPage(url, html)
    page.parse_links()
    links = page.filter_links(tags=['a'],
                              patterns=['^(http://www\.cs\.colorado\.edu)(/info.+)$'])
    elements = page.doc.findall('./body//div')
    for e in elements:
        print "ELEMENTS =========================================="
        print lxml.html.tostring(e, pretty_print=True)
        print "ITEMS------------------------------------------"
class Server(object):
    SHUTDOWN_TIMEOUT = 60

    def __init__(self):
        self._cfg = _load_cfg()
        try:
            with open(os.path.join(APP_PATH, 'data.json')) as json_fp:
                user_data = json.load(json_fp)
        except IOError:
            logging.warning('No user data')
            user_data = {}
        self._download_manager = DownloadManager(self._cfg['user'],
                                                 self._cfg['password'],
                                                 user_data.get('session_id'))

    def stop(self):
        self._bottle_server.stop()

    def _on_heartbeat_timeout(self, heartbeat):
        ''' Called if we haven't had a heartbeat in a while '''
        if any(self._download_manager.get_downloading()):
            logging.debug('No heartbeat but downloading.... Still alive')
            heartbeat.beat()
        else:
            logging.debug('No heartbeat, no downloads. Stopping...')
            self._bottle_server.stop()

    def _on_dl_finished(self, path, file_name):
        ''' TODO this needs sorting '''
        import shutil
        save_path = os.path.join(self._cfg['save_location'], file_name)
        logging.debug('Moving <%s> to <%s>' % (path, save_path,))
        shutil.move(path, save_path)

    def start_download(self, game, game_id, mod_id, file_id):
        self._download_manager.download(self._on_dl_finished, game, game_id,
                                        mod_id, file_id)

    def start_server(self, host, port):
        self._bottle_server = StoppableWSGIRefServer(host=host, port=port)
        hb = Heartbeat()
        install(partial(local_variable_plugin, {
            'cfg': self._cfg,
            'heartbeat': hb,
            'server': self,
            'download_manager': self._download_manager,
        }))
        hb_monitor = HeartbeatMonitor(hb, self.SHUTDOWN_TIMEOUT,
                                      self._on_heartbeat_timeout)
        hb_monitor.monitor()
        run(server=self._bottle_server)
        self._download_manager.stop()
        hb_monitor.stop()
        _save_cfg(self._cfg)
def __init__(self):
    super(Crawler, self).__init__()
    self.downloader = DownloadManager()
    self.webpage = None
    self.init_database()
    self.rules = {}
def create_downloader(self):
    # Instantiate class that takes care of downloading videos
    dl = DownloadManager(self.app.save_dir)
    return dl
class Crawler(object):

    def __init__(self):
        super(Crawler, self).__init__()
        self.downloader = DownloadManager()
        self.webpage = None
        self.init_database()
        self.rules = {}

    def init_database(self):
        self.queue = QueueDB('queue.db')
        self.webpagedb = WebpageDB('webpage.db')
        self.duplcheck = DuplCheckDB('duplcheck.db')

    def add_seeds(self, links):
        new_links = self.duplcheck.filter_dupl_urls(links)
        self.duplcheck.add_urls(new_links)
        self.queue.push_urls(new_links)

    def add_rules(self, rules):
        self.rules = {}
        for url, inurls in rules.items():
            reurl = re.compile(url)
            repatn = []
            for u in inurls:
                repatn.append(re.compile(u))
            self.rules[reurl] = repatn

    def get_patterns_from_rules(self, url):
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) != None:
                patns.extend(ru)
        return list(set(patns))

    def getlinks(self, url, html):
        self.webpage = WebPage(url, html)
        self.webpage.parse_links()
        ruptn = self.get_patterns_from_rules(url)
        #print ruptn
        links = self.webpage.filter_links(tags=['a'], patterns=ruptn)
        return links

    def start(self):
        while 1:
            url = self.queue.pop_url()
            print url
            if url == None:
                print "crawling task is done."
                break
            error_msg, url, redirected_url, html = self.downloader.download(url)
            #print error_msg, url, redirected_url, html
            if html != None:
                self.webpagedb.html2db(url, html)
                links = self.getlinks(url, html)
                self.add_seeds(links)
            self.mysleep(3)

    def mysleep(self, n):
        for i in range(n):
            time.sleep(1)
            print "sleep", i, "of", n
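# A minimal usage sketch for the Crawler above (the seed URL and rule regexes are
# illustrative, not from the original source): rules map a page-URL regex to the
# link regexes that should be followed from pages matching it.
crawler = Crawler()
crawler.add_rules({
    r"^http://www\.cs\.colorado\.edu": [r"^http://www\.cs\.colorado\.edu/.+"],
})
crawler.add_seeds(["http://www.cs.colorado.edu/"])
crawler.start()   # pops URLs from queue.db until the queue runs empty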
def __init__(self, webconf):
    self.webconf = webconf
    config = ConfigParser.ConfigParser()
    config.readfp(open('/etc/raspadmin/downloader.conf'))
    self.path = config.get("PATH", "downloadrep")
    self.downloadManager = DownloadManager(self.path)
def __init__(self):
    self.downloader = DownloadManager()
    self.webpage = None
    self.rules = {}
    self.dbop = OperatorDB()
class Crawler():

    def __init__(self):
        self.downloader = DownloadManager()
        self.webpage = None
        self.init_database()
        self.rules = {}
        self.files = []
        self.file_rule = ".+"

    def init_database(self):
        self.queue = QueueDB('queue.db')
        self.webpagedb = WebpageDB('webpage.db')
        self.duplcheck = DuplCheckDB('duplcheck.db')
        self.repodb = RepoStateDB()

    def add_seeds(self, links):
        new_links = self.duplcheck.filter_dupl_urls(links)
        self.duplcheck.add_urls(new_links)
        self.queue.push_urls(new_links)

    def add_rules(self, rules):
        self.rules = {}
        for url, inurls in rules.items():
            reurl = re.compile(url)
            repatn = []
            for u in inurls:
                repatn.append(re.compile(u))
            self.rules[reurl] = repatn

    def set_file_rule(self, rule):
        self.file_rule = rule

    def get_patterns_from_rules(self, url):
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) != None:
                patns.extend(ru)
        return list(set(patns))

    def download_files(self, files):
        for f in files:
            #cmd = "wget --force-directories -c " + f + " -P " + config.repos_dir
            cmd = "wget -c " + f + " -P " + config.repos_dir
            ret_code = os.system(cmd)
            self.repodb.update(f, ret_code == 0)

    def start(self):
        while 1:
            url = self.queue.pop_url()
            print url
            if url == None:
                print "crawling task is done."
                break
            error_msg, url, redirected_url, html = self.downloader.download(url)
            #print error_msg, url, redirected_url, html
            if html != None:
                self.webpagedb.html2db(url, html)
                self.webpage = WebPage(url, html)
                self.webpage.parse_links()
                ruptn = self.get_patterns_from_rules(url)
                #print ruptn
                links = self.webpage.filter_links(tags=['a'], patterns=ruptn)
                print links
                self.add_seeds(links)
                file_pattern = []
                file_pattern.append(re.compile(self.file_rule))
                files = self.webpage.filter_links(tags=['a'], patterns=file_pattern)
                self.files.append(files)
                #TODO: self.download_files(files)
                print files

    def mysleep(self, n):
        for i in range(n):
            time.sleep(1)
            print "sleep", i, "of", n
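# A usage sketch for the file-collecting Crawler above (the seed URL, rule, and
# file pattern are illustrative, not from the original source): set_file_rule()
# restricts which linked files are gathered into self.files.
crawler = Crawler()
crawler.add_rules({r"^http://example\.org/repo": [r"^http://example\.org/repo/.+"]})
crawler.set_file_rule(r".+\.(rpm|tar\.gz)$")   # only collect package-like links
crawler.add_seeds(["http://example.org/repo/"])
crawler.start()
print crawler.files   # lists of matching file URLs, one list per crawled page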
class WebManager(WebStructure.WebAbstract):

    def __init__(self, webconf):
        self.webconf = webconf
        config = ConfigParser.ConfigParser()
        config.readfp(open('/etc/raspadmin/downloader.conf'))
        self.path = config.get("PATH", "downloadrep")
        self.alldebrid = 0
        try:
            if config.get("ALLDEBRID", "usealldebrid").upper() == "Y":
                self.alldebriduser = config.get("ALLDEBRID", "alldebriduser")
                self.alldebridpass = config.get("ALLDEBRID", "alldebridpass")
                self.alldebrid = AllDebrid(self.alldebriduser, self.alldebridpass)
        except:
            self.alldebrid = None
        self.downloadManager = DownloadManager(self.path)
        rarfile.UNRAR_TOOL = "unrar-nonfree"

    def manage_download(self, str_list):
        list_download = [
            y for y in (x.strip() for x in str_list.splitlines()) if y
        ]
        for url in list_download:
            size = 0
            if self.alldebrid != None:
                if self.alldebrid.isProvider(url):
                    url2 = url
                    (error, url) = self.alldebrid.getLink(url)
                    if error != 0:
                        url = url2
                    else:
                        print error
            # https://github.com/usineur/go-debrid/blob/master/alldebrid/debrid.go
            self.downloadManager.addDownload(url)

    def unrar(self, file):
        if self.downloadManager.updateStatus(file, 'r'):
            try:
                o = rarfile.RarFile(self.path + '/' + file)
                o.extractall(self.path)
                self.downloadManager.updateStatus(file, 'dr')
            except Exception as e:
                self.downloadManager.updateStatus(file, 'df')

    def get_html(self, http_context):
        template = ['header.tpl', 'downloader/downloader.tpl', 'footer.tpl']
        sessionid = http_context.sessionid
        sessionvars = http_context.session.get_vars(sessionid)
        post = http_context.http_post
        if 'str_download' in post.keys():
            self.manage_download(post['str_download'])
        if http_context.suburl == 'getInfo':
            return WebStructure.HttpContext(
                statuscode=200,
                content=json.dumps(self.downloadManager.downloadStatus()),
                template=None,
                mimetype='text/html')
        if http_context.suburl == 'unrar' and 'str_file' in http_context.http_get:
            try:
                t = Thread(target=self.unrar,
                           args=(http_context.http_get['str_file'], ))
                t.daemon = True
                t.start()
            except Exception as e:
                print repr(e)
        content = {
            'token': sessionvars['posttoken'],
            'includefile': 'downloader/headerdownloader.html'
        }
        return WebStructure.HttpContext(statuscode=200, content=content,
                                        template=template, mimetype='text/html')

    def get_module_name(self):
        return "Downloader"