def main(): """Main function""" script_name = os.path.basename(__file__)[:-3] parser = argparse.ArgumentParser( description="Remove directories of repositories that were zipped") parser.add_argument("-v", "--verbose", type=int, default=config.VERBOSE, help="increase output verbosity") parser.add_argument("-i", "--interval", type=int, nargs=2, default=config.REPOSITORY_INTERVAL, help="id interval") parser.add_argument("-c", "--count", action='store_true', help="count results") parser.add_argument('-r', '--reverse', action='store_true', help='iterate in reverse order') parser.add_argument('--check', type=str, nargs='*', default={'all', script_name, script_name + '.py'}, help='check name in .exit') args = parser.parse_args() config.VERBOSE = args.verbose status = None if not args.count: status = StatusLogger(script_name) status.report() with connect() as session, savepid(): apply( session, status, args.count, args.interval, args.reverse, set(args.check) )
def main(): """Main function""" with savepid(): scripts = sys.argv[1:] result = [] try: with mount_basedir(): config.BASE_DIR.mkdir(parents=True, exist_ok=True) config.LOGS_DIR.mkdir(parents=True, exist_ok=True) indexes = [ index for index, name in enumerate(scripts) if name == "--all" or config.Path(name).exists() or config.Path(name + ".py").exists() ] indexes.sort() indexes.append(None) it_indexes = iter(indexes) next(it_indexes) to_execute = { scripts[cur]: scripts[cur + 1:nex] for cur, nex in zip(indexes, it_indexes) } options_to_all = [] if "--all" in to_execute: options_to_all = to_execute["--all"] del to_execute["--all"] if not to_execute: to_execute = {script: [] for script in ORDER} for script, args in to_execute.items(): if check_exit({"all", "main", "main.py"}): print("Found .exit file. Exiting") return if script.endswith(".py"): script = script[:-3] args = args + options_to_all status = execute_script(script, args) result.append("{} {} --> {}".format(script, " ".join(args), status)) print("done") finally: status = StatusLogger("main closed") status.report() if config.EMAIL_TO: yag = yagmail.SMTP(config.EMAIL_LOGIN, oauth2_file=str(config.OAUTH_FILE)) yag.send(to=config.EMAIL_TO.split(";"), subject="{} is idle".format(config.MACHINE), contents="{} finished at {}\n\nScripts:\n".format( config.MACHINE, datetime.now().strftime("%Y%m%dT%H%M%S")) + "\n".join(result))
def main(): """Main function""" try: config.BASE_DIR.mkdir(parents=True, exist_ok=True) config.LOGS_DIR.mkdir(parents=True, exist_ok=True) status = 1 while status == 1: status = execute_script("s0_repository_crawler", []) print("done") finally: status = StatusLogger("main_download closed") status.report()
def main(): """Main function""" script_name = os.path.basename(__file__)[:-3] parser = argparse.ArgumentParser(description="Clone deleted repositories") parser.add_argument("-v", "--verbose", type=int, default=config.VERBOSE, help="increase output verbosity") parser.add_argument("-e", "--retry-errors", action='store_true', help="retry errors") parser.add_argument("-i", "--interval", type=int, nargs=2, default=config.REPOSITORY_INTERVAL, help="id interval") parser.add_argument("-c", "--count", action='store_true', help="count results") parser.add_argument("-d", "--dry-run", action='store_true', help="discover repositories but do not clone") parser.add_argument("-l", "--list", action='store_true', help="list repositories but do not clone nor discover") parser.add_argument('-r', '--reverse', action='store_true', help='iterate in reverse order') parser.add_argument('--check', type=str, nargs='*', default={'all', script_name, script_name + '.py'}, help='check name in .exit') args = parser.parse_args() config.VERBOSE = args.verbose status = None if not args.count: status = StatusLogger(script_name) status.report() with connect() as session, savepid(): apply(session, status, 0 if args.retry_errors else consts.R_FAILED_TO_CLONE, args.dry_run, args.list, args.count, args.interval, args.reverse, set(args.check))
def main(): """Main function""" script_name = os.path.basename(__file__)[:-3] parser = argparse.ArgumentParser(description='Execute repositories') parser.add_argument('-v', '--verbose', type=int, default=config.VERBOSE, help='increase output verbosity') parser.add_argument('-e', '--retry-errors', action='store_true', help='retry errors') parser.add_argument('-i', '--interval', type=int, nargs=2, default=config.REPOSITORY_INTERVAL, help='repository id interval') parser.add_argument('-c', '--count', action='store_true', help='count results') parser.add_argument('-r', '--reverse', action='store_true', help='iterate in reverse order') parser.add_argument('--check', type=str, nargs='*', default={'all', script_name, script_name + '.py'}, help='check name in .exit') args = parser.parse_args() config.VERBOSE = args.verbose status = None if not args.count: status = StatusLogger(script_name) status.report() with connect() as session, savepid(): apply(session, status, args.count, args.interval, args.reverse, set(args.check))
class Querier(object):
    """Queries github"""

    def __init__(self, github=None):
        self.github = github or Github(config.GITHUB_USERNAME,
                                       config.GITHUB_PASSWORD)
        self.status = StatusLogger("repository_crawler")
        self.status.report()
        self.first_date = config.FIRST_DATE
        self.last_date = None
        self.delta = None
        self.page = -1
        self.query = ""
        self.reset_page = True

    def initialize_date(self):
        """Initialize last_date and delta"""
        if self.reset_page:
            self.delta = None
        if self.last_date is None:
            self.last_date = self.first_date + timedelta(365)
        if self.delta is None:
            self.delta = timedelta(365)
            if self.first_date is not None:
                self.delta = (self.last_date - self.first_date) / 2

    def query_repositories(self):
        """Query repositories"""
        self.initialize_date()
        query = ['language:"Jupyter Notebook"']
        if self.first_date is None:
            query.append("created:<=" + time(self.last_date))
        else:
            query.append("created:{}..{}".format(
                time(self.first_date), time(self.last_date)))
        self.query = " ".join(query)
        pagination = self.github.search_repositories(self.query, order="desc")
        count = pagination.totalCount
        if config.VERBOSE > 1:
            print("> Adjusting query {!r} (count = {})".format(
                self.query, count))
        # GitHub search exposes at most 1000 results per query, so grow the
        # date range while results are scarce and shrink it on overflow
        if count < 500 and self.last_date < datetime.now():
            self.last_date += self.delta
            self.delta *= 1.5
            return self.query_repositories()
        if count >= 1000:
            self.last_date -= self.delta
            self.delta /= 2
            return self.query_repositories()
        if self.reset_page:
            self.page = 0
        self.reset_page = True
        if config.VERBOSE > 0:
            print("Query executed with {} results: {!r}".format(
                count, self.query))
        return pagination, count

    def next_range(self):
        """Adjust range to next result"""
        self.first_date = self.last_date
        self.last_date += self.delta
        self.delta = None

    def iterate_repository_pagination(self, session, pagination, count):
        """Iterate on repository pagination"""
        pages = int(count / 30)
        for self.page in range(self.page, pages):
            if check_exit({"all", "repository_crawler",
                           "repository_crawler.py"}):
                raise RuntimeError("Found .exit file. Exiting")
            if folder_size(str(config.BASE_DIR)) > config.MAX_SIZE:
                raise RuntimeError("Content folder is too big. Clean it up")
            if config.VERBOSE > 1:
                print("> Processing page {}".format(self.page))
            repositories = pagination.get_page(self.page)
            for repository in repositories:
                load_repository(session, "github.com", repository.full_name)
                session.commit()
                self.status.count += 1
                self.status.report()
        query = Query(
            name="repository",
            query=self.query,
            first_date=self.first_date,
            last_date=self.last_date,
            delta=self.delta,
            count=count,
        )
        session.add(query)
        session.commit()
        if config.VERBOSE > 0:
            print("> Finished query. ID={}".format(query.id))

    def recover(self, session):
        """Recover information from .stop.json or database"""
        strptime = datetime.strptime
        if os.path.exists(".stop.json"):
            with open(".stop.json", "rb") as stop_file:
                dic = json.load(stop_file)
            if not dic["delta"].startswith("000"):
                dic["delta"] = "000" + dic["delta"]
            self.delta = strptime(dic["delta"], FORMAT) - datetime.min
            self.last_date = strptime(dic["last_date"], FORMAT)
            self.first_date = None
            if dic["first_date"]:
                self.first_date = strptime(dic["first_date"], FORMAT)
            self.page = dic["page"]
            self.reset_page = False
            self.query = dic["query"]
            return True
        the_query = session.query(Query).order_by(desc(Query.last_date))
        query = the_query.first()
        if query:
            self.last_date = query.last_date
            self.first_date = query.first_date
            self.page = 0
            self.reset_page = True
            self.query = query.query
            self.delta = query.delta
            self.next_range()
            return True
        return False

    def save(self):
        """Save .stop.json"""
        stop = {
            "query": self.query,
            "last_date": time(self.last_date),
            "first_date": time(self.first_date),
            "delta": time(datetime.min + self.delta),
            "page": self.page,
        }
        with open(".stop.json", "w", encoding="utf-8") as stop_file:
            json.dump(stop, stop_file)

    def search_repositories(self):
        """Search repositories"""
        with connect() as session, mount_basedir():
            try:
                if not self.recover(session):
                    self.iterate_repository_pagination(
                        session, *self.query_repositories())
                    self.next_range()
                while self.last_date < datetime.now():
                    self.iterate_repository_pagination(
                        session, *self.query_repositories())
                    self.next_range()
            except Exception as err:  # pylint: disable=broad-except
                self.save()
                print("Stopped due to {}. File '.stop.json' created".format(
                    err))
                import traceback
                if config.VERBOSE > 1:
                    traceback.print_exc()
                if str(err) == "Content folder is too big. Clean it up":
                    sys.exit(2)
                else:
                    sys.exit(1)