def index(request):
    pip = os.path.join(sys.exec_prefix, 'bin', 'pip')
    if not os.path.isfile(pip):
        pip = 'pip'
    SHELL_COMMANDS = (
        ('Hostname', 'hostname'),
        ('hg version', 'hg id'),
        ('git version', "git log --pretty=format:'%h' -n 1"),
        ('hg branch', 'hg branch'),
        ('git branch', 'git rev-parse --abbrev-ref HEAD'),
        ('MySQL version', 'mysql --version'),
        ('Local Packages', '%s freeze -l' % pip),
    )
    SD = OrderedDict()
    for k, v in sorted(settings_list(), key=lambda x: x[0]):
        SD[k] = v
    context = RequestContext(request, {
        'args': sys.argv,
        'exe': sys.executable,
        'settings': SD,
    })
    context['versions'] = OrderedDict()
    # get versions
    curr_dir = os.path.realpath(os.path.dirname(__file__))
    for name, shell_command in SHELL_COMMANDS:
        try:
            result = utils.run_shell_command(shell_command, curr_dir)
            if result:
                if isinstance(result, list):
                    # join multi-line command output into a single string for the template
                    result = '<br>'.join(result)
                context['versions'][name] = result
        except:
            pass
    # machine status
    context['machine'] = OrderedDict()
    if sys.platform == 'darwin':
        context['machine']['Uptime'] = 'not done yet on MacOS'
        context['machine']['Disk Space'] = 'not done yet on MacOS'
    elif sys.platform == 'win32':
        context['machine']['Uptime'] = 'not done yet on Windows'
        context['machine']['Disk Space'] = 'not done yet on Windows'
    else:
        context['machine']['Uptime'] = utils.server_uptime()
        context['machine']['Disk Space'] = utils.disk_usage('/')._asdict()
    if os.path.exists(settings.MEDIA_ROOT):
        context['machine']['Media Folder'] = utils.sizeof_fmt(utils.folder_size(settings.MEDIA_ROOT))
    context['stats'] = utils.get_available_stats()
    context['apps'] = [(app.__name__, ', '.join([model.__name__ for model in models]))
                       for app, models in all_concrete_models()]
    context['relations'] = [[(model.__name__,
                              ', '.join(['%s (%s) through %s' % (relation.__name__,
                                                                 relation.__module__,
                                                                 field.__class__.__name__)
                                         for field, relation in relations]),
                              app.__name__)
                             for model, relations in rel_info]
                            for app, rel_info in all_relations()]
    #context['rel_graph'] =
    context['config_warnings'] = utils.get_configuration_warnings()
    return render_to_response('dashboard/index.html', context)
def download(self):
    work_folder = os.path.dirname(smart_str(self.resource.content_root()))
    print "- CREATING DIR..."
    subprocess.call("mkdir -vp %s" % work_folder, shell=True)
    # run downloader
    urltodownload = str(self.resource.resource_download_url)
    basetarget = str(work_folder)
    try:
        downloader(urltodownload, basetarget)
        self.resource.status = 'downloaded'
        size = folder_size(work_folder)
        self.resource.size = size
        contents = os.listdir(work_folder)
        if contents:
            self.resource.trigger = contents[0]
            self.resource.trigger_extensions = self.resource.trigger.rsplit('.')[-1]
            self.resource.resource_downloaded_file = self.resource.trigger
    except:
        self.resource.status = 'error'
        self.resource.save()
        raise
def post_save(self):
    """
    Called when dataset save process (tar, dumps, etc.) is done
    """
    ftp_backup = self.get_option('ftp_backup', []).split(',')
    for host in ftp_backup:
        host_options = self.savior.hosts[host]
        kwargs = {
            "dataset_name": self.name,
            "local_saves_directory": self.savior.save_path,
            "dataset_save_id": self.savior.stamp_str,
        }
        connector = mapping.MAPPING['ftpupload'](
            host_options=host_options,
            **kwargs
        )
        connector.upload()
    keep_local_saves = self.convert_to_boolean(self.get_option('keep_local_saves'))
    self.size += folder_size(self.current_save_directory, human=False)
    if not keep_local_saves:
        self.remove_local_save()
        self.remove_old_saves()
    return True
def upload_manager():
    global config, default_check_interval
    try:
        default_check_interval = config['local_folder_check_interval']
        logger.debug("Started upload manager for %r", config['local_folder'])
        while True:
            time.sleep(60 * config['local_folder_check_interval'])

            # restore check interval to original after an extended sleep after a rate limit ban (25hrs)
            if config['local_folder_check_interval'] == 1500:
                config['local_folder_check_interval'] = default_check_interval
                logger.info(
                    "Restored local_folder_check_interval to %d minutes after an extended sleep (25 hours) due "
                    "to the last upload being cancelled due to rate limits!",
                    config['local_folder_check_interval'])
                if config['pushover_app_token'] and config['pushover_user_token']:
                    utils.send_pushover(
                        config['pushover_app_token'], config['pushover_user_token'],
                        "local_folder_check_interval has been reset back to %d minutes after a 25 hour "
                        "sleep due to ratelimits!" % config['local_folder_check_interval'])

            logger.debug("Checking size of %r", config['local_folder'])
            size = utils.folder_size(config['local_folder'], config['du_excludes'])
            if size is not None and size > 0:
                if size >= config['local_folder_size']:
                    logger.debug("Local folder has %d gigabytes, %d too many!",
                                 size, size - config['local_folder_size'])

                    # check if files are opened, skip this upload if so
                    opened_files = utils.opened_files(config['local_folder'], config['lsof_excludes'])
                    if opened_files:
                        for item in opened_files:
                            logger.debug("File is being accessed: %r", item)
                        logger.debug("Local folder has %d file(s) open, skipping upload until next check...",
                                     len(opened_files))
                        # send skip notification
                        if config['pushover_app_token'] and config['pushover_user_token']:
                            utils.send_pushover(
                                config['pushover_app_token'], config['pushover_user_token'],
                                "Upload process of %d gigabytes temporarily skipped.\n"
                                "%d file(s) are currently being accessed." % (size, len(opened_files)))
                        continue

                    # remove hidden before upload
                    # (we don't want to delete a hidden from remote, after already replacing it)
                    logger.debug("Purging _HIDDEN~ before upload commences")
                    remove_hidden()

                    # send start notification
                    if config['pushover_app_token'] and config['pushover_user_token']:
                        utils.send_pushover(
                            config['pushover_app_token'], config['pushover_user_token'],
                            "Upload process started. %d gigabytes to upload." % size)

                    # rclone move local_folder to local_remote
                    logger.debug("Moving data from %r to %r...",
                                 config['local_folder'], config['local_remote'])
                    upload_cmd = utils.rclone_move_command(
                        config['local_folder'], config['local_remote'],
                        config['rclone_transfers'], config['rclone_checkers'],
                        config['rclone_bwlimit'], config['rclone_excludes'],
                        config['rclone_chunk_size'], config['dry_run'])
                    logger.debug("Using: %r", upload_cmd)

                    start_time = timeit.default_timer()
                    utils.run_command(upload_cmd, config)
                    time_taken = timeit.default_timer() - start_time
                    logger.debug("Moving finished in %s", utils.seconds_to_string(time_taken))

                    # remove empty directories
                    if len(config['rclone_remove_empty_on_upload']):
                        time.sleep(5)
                        utils.remove_empty_directories(config)

                    new_size = utils.folder_size(config['local_folder'], config['du_excludes'])
                    logger.debug("Local folder is now left with %d gigabytes", new_size)

                    # send finish notification
                    if config['pushover_app_token'] and config['pushover_user_token']:
                        utils.send_pushover(
                            config['pushover_app_token'], config['pushover_user_token'],
                            "Upload process finished in %s. %d gigabytes left over."
                            % (utils.seconds_to_string(time_taken), new_size))
                else:
                    logger.debug("Local folder is still under the max size by %d gigabytes",
                                 config['local_folder_size'] - size)

    except Exception as ex:
        logger.exception("Exception occurred: ")
def handle(self, *args, **options):
    # REGISTERING SOURCE
    source, created = Source.objects.get_or_create(pk=SOURCE_ID, url=SOURCE_URL, slug=SOURCE_SLUG, name=SOURCE_NAME)
    logger.info("SOURCE: %s, Created: %s " % (source, created))
    # pagesets as arguments
    sync = options.get('sync')
    get = options.get('get')
    nodownload = options.get('nodownload')
    force_download = options.get('force_download')
    range_values = options.get('range_values')
    # GET SPECIFIC ITEMS, AND DO SOMETHING
    if sync:
        if args:
            print "ARGS:", args
            user = args[0]
            language_code = YOUTUBE_USERS[user]
            language, created = Language.objects.get_or_create(code=language_code)
            # try to get the source from database. this will create
            source, created = Source.objects.get_or_create(pk=SOURCE_ID, url=SOURCE_URL, slug=SOURCE_SLUG, name=SOURCE_NAME)
            print "Source created?", created
            # get total of videos per user:
            try:
                logger.info("USER: %s, LANGUAGE: %s" % (user, language))
                print "GETTING TOTAL OF VIDEOS..."
                BASE_URL = "https://gdata.youtube.com/feeds/api/users/%s/uploads" % user
                f = urllib2.urlopen(BASE_URL)
                data = f.read()
                f.close()
                p = parseString(data)
                a = p.getElementsByTagName('openSearch:totalResults')
                try:
                    total_items = int(a[0].childNodes[0].data)
                    logger.info("TOTAL VIDEOS: %d" % total_items)
                    # loop in all items
                    for index in range(1, total_items, 50):  # [0:1]:
                        logger.info("ITEM INDEX ID: %d" % index)
                        MOD_URL = BASE_URL + "?start-index=" + str(index) + "&max-results=50"
                        logger.info("HITTING: %s" % MOD_URL)
                        f = urllib2.urlopen(MOD_URL)
                        data = f.read()
                        f.close()
                        p = parseString(data)
                        urls = []
                        # debug
                        print "URLS"
                        for entry in p.getElementsByTagName("entry"):
                            print entry.getElementsByTagName('id')[0].childNodes[0].data
                        for entry in p.getElementsByTagName("entry"):
                            url = entry.getElementsByTagName('id')[0].childNodes[0].data
                            title = entry.getElementsByTagName('title')[0].childNodes[0].data
                            youtubeid = url.split("/")[-1]
                            youtube_url = "http://www.youtube.com/watch?v=%s" % youtubeid
                            logger.info("URL to HIT: %s" % youtube_url)
                            # get or create resource
                            resource, created = Resource.objects.get_or_create(
                                resource_reference_string=youtubeid,
                                source=source,
                                resource_url=youtube_url,
                                language=language
                            )
                            resource.category = Category.objects.filter(code__in=['video', 'video-class'])
                            if not os.path.isdir(resource.content_root_path()):
                                try:
                                    os.makedirs(resource.content_root_path())
                                except:
                                    print "ERROR! CANT CREATE %s!" % resource.content_root_path()
                                    raise
                            logger.info("GRID: %s, CREATED: %s STATUS: %s" % (resource.id, created, resource.status))
                            if resource.status == "installed":
                                logger.info("installed. passing")
                            else:
                                # get more data from youtube
                                json_url = "http://gdata.youtube.com/feeds/api/videos/%s?v=2&alt=jsonc" % youtubeid
                                request = urllib2.urlopen(json_url)
                                json_data = json.load(request)
                                description = json_data['data'].get('description')
                                duration = json_data['data'].get('duration')
                                resource_pageviews = json_data['data'].get('viewCount')
                                #tags = ', '.join(json_data['data'].get('tags'))
                                tags = ''
                                # download using: https://github.com/NFicano/python-youtube-download
                                if downloader == "youtube.py":
                                    yt = youtube.YouTube()
                                    yt.url = youtube_url
                                    print "URL", yt.url
                                    yt.filename = youtubeid
                                    yt.filter("mp4")[0].download(resource.content_root_path(), youtubeid)
                                    resource.trigger = "%s.mp4" % yt.filename
                                    reload(youtube)
                                else:
                                    dlcmd = 'python %s/youtube-dl.py -c --write-info-json --write-description -f 18 %s' % (settings.INSTANCE(), youtube_url)
                                    logger.info("COMMAND: %s" % dlcmd)
                                    resource.create_content_root()
                                    os.chdir(resource.content_root_path())
                                    try:
                                        p = subprocess.call(dlcmd, shell=True)
                                        resource.status = "installed"
                                        resource.enabled = True
                                        resource.trigger = "%s.%s" % (youtubeid, "mp4")
                                    except:
                                        resource.enabled = False
                                        resource.status = "error"
                                resource.tags = tags
                                resource.title = title
                                resource.author = "http://www.youtube.com/user/%s" % user
                                resource.duration = duration
                                resource.size = folder_size(resource.content_root_path())
                                resource.resource_pageviews = resource_pageviews
                                resource.save()
                                # generate thumbs
                                resource.generate_thumb()
                except:
                    print "ERROR!"
                    raise
            except:
                print "ERROR, USER NOT LISTED ON SCRIPT"
                print "OPTIONS ARE --sync: %s" % ", ".join(YOUTUBE_USERS)
                raise
        else:
            print "OPTIONS ARE --sync: %s" % ", ".join(YOUTUBE_USERS)
def handle(self, *args, **options):
    # pagesets as arguments
    if args:
        try:
            start, finish = args[0].split(',')
        except:
            pass
    else:
        start, finish = 1, SOURCE_TOTAL_PAGES
    sync = options.get('sync')
    get = options.get('get')
    nodownload = options.get('nodownload')
    force_download = options.get('force_download')
    range_values = options.get('range_values')
    if get:
        grids = get[0].split(",")
        for grid in grids:
            print "GRID:", grid
            resource = Resource.objects.get(pk=grid)
            resourceitem = PortalDoProfessorItem(resource, False)
            print "GRID TITLE: %s" % resourceitem.title
            resourceitem.download()
    if sync:
        # try to get the source from database
        source, created = Source.objects.get_or_create(pk=SOURCE_ID, url=SOURCE_URL, slug=SOURCE_SLUG, name=SOURCE_NAME)
        logger.info("Source created? %s" % created)
        print "TOTAL PAGES: %s" % SOURCE_TOTAL_PAGES
        all_pages = range(SOURCE_TOTAL_PAGES)
        #all_pages.reverse()
        for page in all_pages[int(start):int(finish)]:
            logger.info("PAGE %s" % page)
            url = "http://portaldoprofessor.mec.gov.br/recursos.html?pagina=%s&tamanhoPagina=%s&ajax" % (page, SOURCE_ITEMS_PER_PAGE)
            logger.info("hitting %s" % url)
            f = urllib.urlopen(url)
            s = f.read()
            f.close()
            logger.info("parsing...")
            print "parsing..."
            soup = BeautifulSoup(s)
            in_page_items = len(soup.findAll('tr'))
            logger.info("IN_PAGE_ITEMS: %s" % in_page_items)
            # for each individual resource
            i = 0
            for resource_item in range(1, in_page_items):
                logger.info("#######" * 4)
                try:
                    id = soup('tr')[resource_item].first('a').attrs[0][1].split('=')[1]
                    cat = soup('tr')[resource_item].findAll('img')[0].attrs[0][1].split("/")[1].split("_")[1].split(".")[0]
                except:
                    id = "error%s" % i
                    i += 1
                    cat = ''
                # resource informations
                resource_url = "%sfichaTecnica.html?id=%s" % (SOURCE_URL, id)
                resource, created = Resource.objects.get_or_create(
                    resource_reference_string=id,
                    source=source,
                    resource_url=resource_url
                )
                first_status = resource.status
                logger.info("Created? %s" % created)
                logger.info("DBITEM? %s" % resource.pk)
                logger.info("PAGE? %s" % page)
                logger.info("FIRST STATUS: %s" % first_status)
                try:
                    category_object = Category.objects.get(pk=CATEGORY_DICT[cat])
                except:
                    category_object = ""
                if resource.status != 'installed' and resource.status != 'downloaded' and resource.status != 'error':
                    resource.status = "processing"
                # START CLASS
                r = PortalDoProfessorItem(resource, created)
                r.parse()
                logger.info("TITLE: %s" % r.title)
                try:
                    r.resource.category.add(category_object)
                except:
                    pass
                try:
                    r.save()
                except Exception as e:
                    logger.error('ERROR PARSING ID: %d', r.resource.pk)
                    r.resource.status = 'error'
                    logger.error('EXCEPTION: %s', e)
                    # even here the tag field can be truncated
                    # and break the save
                    try:
                        r.save()
                    except Exception as e:
                        logger.error('EXCEPTION: %s', e)
                        r.resource.tags = ''
                        try:
                            r.resource.save()
                        except:
                            pass
                if nodownload:
                    logger.info("NOT DOWNLOADING! STATUS: %s" % r.resource.status)
                else:
                    if force_download or first_status != 'downloaded' and first_status != 'installed' and first_status != 'error':
                        try:
                            logger.info("FORCING DOWNLOAD? %s, FIRST STATUS: %s" % (force_download, first_status))
                            r.download()
                            r.resource.status = 'downloaded'
                            r.resource.save()
                            r.finish()
                        except Exception as e:
                            logger.error("ERROR DOWNLOADING")
                            logger.error('EXCEPTION: %s', e)
                            r.resource.status = 'error'
                            try:
                                r.resource.save()
                            except Exception as e:
                                logger.error('EXCEPTION: %s', e)
                                logger.error("DEAD END")
                                pass
                    else:
                        logger.info("-- CONTENT ALREADY MARKED AS DOWNLOADED")
                        r.size = folder_size(r.resource.content_root())
                        try:
                            r.save()
                        except:
                            pass
def org_clone(org):
    """Clone all public non-forked repos from the specified org.

    Repos are cloned to subfolders under the 'folder' setting in config.json.
    """
    # optional list of org/repos to be skipped ...
    if os.path.isfile("skiplist.txt"):
        skiplist = open("skiplist.txt").read().lower().splitlines()
    else:
        skiplist = []

    print("Org".ljust(21) + "Repo".ljust(61) + "KB estimate KB actual seconds KB/sec")
    print(20 * "-" + " " + 60 * "-" + " " + "----------- ----------- ------- -------")

    # if log file doesn't exist, create it
    logfile = os.path.join(SETTINGS["folder"], "logfile.csv")
    if not os.path.isfile(logfile):
        open(logfile, "w").write("datetime,org,repo,KB-estimate,KB-actual,seconds,KB/second\n")

    org_folder = os.path.join(SETTINGS["folder"], org)
    if SETTINGS["overwrite"]:
        folder_del(org_folder)  # delete existing org data
        os.makedirs(org_folder)  # create empty org folder
    else:
        # In non-overwrite mode, only create org folder if it doesn't exist.
        if not os.path.exists(org_folder):
            os.makedirs(org_folder)

    tot_estimate = 0  # total estimated repo size (from GitHub API)
    tot_actual = 0  # total actual size on disk
    tot_seconds = 0  # total elapsed time

    for repo, size_api in repolist(org):
        if f"{org}/{repo}".lower() in skiplist:
            continue  # repos in skiplist are not cloned
        start = default_timer()
        folder = os.path.join(org_folder, repo)
        if not SETTINGS["overwrite"]:
            # Don't clone this repo if target folder exists and is non-empty.
            if non_empty_folder(folder):
                continue
        print(f"{org:20} {repo:60} ", end="")
        Repo.clone_from("https://github.com/" + org + "/" + repo + ".git", folder)
        size_actual = folder_size(folder) / 1024
        elapsed = default_timer() - start
        tot_estimate += size_api
        tot_actual += size_actual
        tot_seconds += elapsed
        print(f"{size_api:9,.0f} {size_actual:9,.0f} {elapsed:7.2f} {size_actual/elapsed:7.0f}")
        timestamp = str(datetime.datetime.now())[:19]
        open(logfile, "a").write(",".join([
            timestamp,
            org,
            repo,
            str(round(size_api)),
            str(round(size_actual)),
            str(round(elapsed, 2)),
            str(round(size_actual / elapsed)),
        ]) + "\n")

    avg_kb_per_second = 0 if tot_seconds == 0 else tot_actual / tot_seconds
    print("TOTALS:".rjust(84) + f"{tot_estimate:9,.0f} {tot_actual:9,.0f} "
          f"{tot_seconds:7.2f} {avg_kb_per_second:7.0f}\n")