Example #1
0
def main():
    """Command-line entry point.

    Parses the options, stores the verbosity in ``config.VERBOSE`` and
    hands everything to ``apply`` inside a database session. A status
    logger is only created when ``--count`` was not requested.
    """
    script_name = os.path.basename(__file__)[:-3]
    parser = argparse.ArgumentParser(
        description="Remove directories of repositories that were zipped")

    # (flags, keyword options) pairs, registered in display order.
    argument_table = (
        (("-v", "--verbose"),
         dict(type=int, default=config.VERBOSE,
              help="increase output verbosity")),
        (("-i", "--interval"),
         dict(type=int, nargs=2, default=config.REPOSITORY_INTERVAL,
              help="id interval")),
        (("-c", "--count"),
         dict(action='store_true', help="count results")),
        (('-r', '--reverse'),
         dict(action='store_true', help='iterate in reverse order')),
        (('--check',),
         dict(type=str, nargs='*',
              default={'all', script_name, script_name + '.py'},
              help='check name in .exit')),
    )
    for flags, options in argument_table:
        parser.add_argument(*flags, **options)

    cli = parser.parse_args()
    config.VERBOSE = cli.verbose

    # Counting mode runs without a status logger.
    status = None
    if not cli.count:
        status = StatusLogger(script_name)
        status.report()

    with connect() as session, savepid():
        apply(session, status, cli.count, cli.interval, cli.reverse,
              set(cli.check))
Example #2
0
def main():
    """Run the requested scripts one after another.

    The command line is a sequence of script names (or ``--all``), each
    followed by the options that belong to it. Options following
    ``--all`` are appended to every script. When no script is named, the
    scripts listed in ``ORDER`` run without extra options. A status
    report is always emitted at the end, and an e-mail summary is sent
    when ``config.EMAIL_TO`` is set.
    """
    with savepid():
        argv = sys.argv[1:]
        summary = []

        try:
            with mount_basedir():
                config.BASE_DIR.mkdir(parents=True, exist_ok=True)
            config.LOGS_DIR.mkdir(parents=True, exist_ok=True)

            # Positions of the tokens that name a script (or "--all").
            starts = []
            for position, token in enumerate(argv):
                if (token == "--all" or config.Path(token).exists()
                        or config.Path(token + ".py").exists()):
                    starts.append(position)

            # Each script owns the tokens up to the next script position;
            # the last one owns everything up to the end (slice end None).
            ends = starts[1:] + [None]
            to_execute = {}
            for begin, end in zip(starts, ends):
                to_execute[argv[begin]] = argv[begin + 1:end]

            options_to_all = to_execute.pop("--all", [])

            if not to_execute:
                to_execute = {script: [] for script in ORDER}

            for script, own_args in to_execute.items():
                if check_exit({"all", "main", "main.py"}):
                    print("Found .exit file. Exiting")
                    return
                name = script[:-3] if script.endswith(".py") else script
                full_args = own_args + options_to_all
                outcome = execute_script(name, full_args)
                summary.append("{} {} --> {}".format(
                    name, " ".join(full_args), outcome))
            print("done")
        finally:
            closing = StatusLogger("main closed")
            closing.report()
            if config.EMAIL_TO:
                mailer = yagmail.SMTP(config.EMAIL_LOGIN,
                                      oauth2_file=str(config.OAUTH_FILE))
                recipients = config.EMAIL_TO.split(";")
                subject = "{} is idle".format(config.MACHINE)
                header = "{} finished at {}\n\nScripts:\n".format(
                    config.MACHINE,
                    datetime.now().strftime("%Y%m%dT%H%M%S"))
                mailer.send(to=recipients,
                            subject=subject,
                            contents=header + "\n".join(summary))
    def __init__(self, github=None):
        """Set up the client, report the initial status and reset state.

        Arguments:
        github -- client to search with; when falsy, a ``Github`` client
                  is created from the credentials in ``config``
        """
        if not github:
            github = Github(config.GITHUB_USERNAME, config.GITHUB_PASSWORD)
        self.github = github

        self.status = StatusLogger("repository_crawler")
        self.status.report()

        # Date window of the search and its step.
        self.first_date, self.last_date, self.delta = (
            config.FIRST_DATE, None, None)
        # Pagination cursor and the text of the last query.
        self.page, self.reset_page = -1, True
        self.query = ""
Example #4
0
def main():
    """Run the repository crawler repeatedly until it stops returning 1.

    The base and logs directories are created first. A closing status
    report is emitted whether or not the loop finished normally.
    """
    try:
        config.BASE_DIR.mkdir(parents=True, exist_ok=True)
        config.LOGS_DIR.mkdir(parents=True, exist_ok=True)
        # Keep re-running for as long as the script reports status 1.
        while execute_script("s0_repository_crawler", []) == 1:
            pass
        print("done")
    finally:
        closing = StatusLogger("main_download closed")
        closing.report()
Example #5
0
def main():
    """Command-line entry point.

    Parses the options, stores the verbosity in ``config.VERBOSE`` and
    calls ``apply`` inside a database session. With ``--retry-errors``
    the third argument passed to ``apply`` is 0, otherwise it is
    ``consts.R_FAILED_TO_CLONE``. A status logger is only created when
    ``--count`` was not requested.
    """
    script_name = os.path.basename(__file__)[:-3]
    parser = argparse.ArgumentParser(description="Clone deleted repositories")

    def add_flag(short, long_name, help_text):
        """Register a boolean switch on the parser."""
        parser.add_argument(short, long_name, action='store_true',
                            help=help_text)

    parser.add_argument("-v", "--verbose", type=int, default=config.VERBOSE,
                        help="increase output verbosity")
    add_flag("-e", "--retry-errors", "retry errors")
    parser.add_argument("-i", "--interval", type=int, nargs=2,
                        default=config.REPOSITORY_INTERVAL,
                        help="id interval")
    add_flag("-c", "--count", "count results")
    add_flag("-d", "--dry-run", "discover repositories but do not clone")
    add_flag("-l", "--list",
             "list repositories but do not clone nor discover")
    add_flag('-r', '--reverse', 'iterate in reverse order')
    parser.add_argument('--check', type=str, nargs='*',
                        default={'all', script_name, script_name + '.py'},
                        help='check name in .exit')

    cli = parser.parse_args()
    config.VERBOSE = cli.verbose

    # Counting mode runs without a status logger.
    status = None
    if not cli.count:
        status = StatusLogger(script_name)
        status.report()

    with connect() as session, savepid():
        if cli.retry_errors:
            skip_status = 0
        else:
            skip_status = consts.R_FAILED_TO_CLONE
        apply(session, status, skip_status, cli.dry_run, cli.list,
              cli.count, cli.interval, cli.reverse, set(cli.check))
def main():
    """Command-line entry point.

    Parses the options, stores the verbosity in ``config.VERBOSE`` and
    hands the remaining options to ``apply`` inside a database session.
    A status logger is only created when ``--count`` was not requested.
    """
    script_name = os.path.basename(__file__)[:-3]
    parser = argparse.ArgumentParser(description='Execute repositories')

    # (flags, keyword options) pairs, registered in display order.
    argument_table = [
        (('-v', '--verbose'),
         {'type': int, 'default': config.VERBOSE,
          'help': 'increase output verbosity'}),
        (('-e', '--retry-errors'),
         {'action': 'store_true', 'help': 'retry errors'}),
        (('-i', '--interval'),
         {'type': int, 'nargs': 2, 'default': config.REPOSITORY_INTERVAL,
          'help': 'repository id interval'}),
        (('-c', '--count'),
         {'action': 'store_true', 'help': 'count results'}),
        (('-r', '--reverse'),
         {'action': 'store_true', 'help': 'iterate in reverse order'}),
        (('--check',),
         {'type': str, 'nargs': '*',
          'default': {'all', script_name, script_name + '.py'},
          'help': 'check name in .exit'}),
    ]
    for flags, options in argument_table:
        parser.add_argument(*flags, **options)

    cli = parser.parse_args()
    config.VERBOSE = cli.verbose

    # Counting mode runs without a status logger.
    status = None
    if not cli.count:
        status = StatusLogger(script_name)
        status.report()

    with connect() as session, savepid():
        apply(session, status, cli.count, cli.interval, cli.reverse,
              set(cli.check))
class Querier(object):
    """Queries github

    Searches for Jupyter Notebook repositories in consecutive
    creation-date windows (``first_date`` .. ``last_date``). The window
    is resized until the reported result count falls between 500 and 999
    (or the window already reaches the present), then every result page
    is loaded into the database. Progress can be saved to and recovered
    from ``.stop.json``.
    """

    def __init__(self, github=None):
        """Set up the client, report the initial status and reset state.

        Arguments:
        github -- client to search with; when falsy, a ``Github`` client
                  is created from the credentials in ``config``
        """
        self.github = github or Github(config.GITHUB_USERNAME,
                                       config.GITHUB_PASSWORD)
        self.status = StatusLogger("repository_crawler")
        self.status.report()

        self.first_date = config.FIRST_DATE
        self.last_date = None
        self.delta = None
        self.page = -1
        self.query = ""
        self.reset_page = True

    @staticmethod
    def _page_count(count, per_page=30):
        """Return how many pages are needed to hold ``count`` results.

        Rounds up, so a trailing partial page is included. Non-positive
        counts yield 0.

        NOTE(review): the default of 30 is the page size the original
        code assumed -- confirm it matches the pagination object in use.
        """
        if count <= 0:
            return 0
        return (count + per_page - 1) // per_page

    def initialize_date(self):
        """Initialize last_date and delta

        When ``reset_page`` is set, ``delta`` is discarded and recomputed.
        A missing ``last_date`` defaults to 365 days after ``first_date``.
        A missing ``delta`` becomes half of the current window, or 365
        days when there is no ``first_date``.
        """
        if self.reset_page:
            self.delta = None
        if self.last_date is None:
            # NOTE(review): raises TypeError if first_date is also None.
            self.last_date = self.first_date + timedelta(365)
        if self.delta is None:
            self.delta = timedelta(365)
            if self.first_date is not None:
                self.delta = (self.last_date - self.first_date) / 2

    def query_repositories(self):
        """Query repositories

        Repeats the search, growing or shrinking the date window, until
        the result count is acceptable. Implemented as a loop rather than
        recursion so that slow convergence cannot exhaust the call stack.

        Returns a ``(pagination, count)`` tuple for the accepted query.
        """
        while True:
            self.initialize_date()
            query = ['language:"Jupyter Notebook"']
            if self.first_date is None:
                query.append("created:<=" + time(self.last_date))
            else:
                query.append("created:{}..{}".format(time(self.first_date),
                                                     time(self.last_date)))
            self.query = " ".join(query)
            pagination = self.github.search_repositories(self.query,
                                                         order="desc")
            count = pagination.totalCount
            if config.VERBOSE > 1:
                print("> Adjusting query {!r} (count = {})".format(
                    self.query, count))
            if count < 500 and self.last_date < datetime.now():
                # Too few results and room to grow: widen the window.
                self.last_date += self.delta
                self.delta *= 1.5
                continue
            if count >= 1000:
                # Too many results: narrow the window.
                self.last_date -= self.delta
                self.delta /= 2
                continue
            break
        if self.reset_page:
            self.page = 0
        self.reset_page = True
        if config.VERBOSE > 0:
            print("Query executed with {} results: {!r}".format(
                count, self.query))
        return pagination, count

    def next_range(self):
        """Adjust range to next result

        The next window starts where the current one ended; ``delta`` is
        cleared so that it is recomputed for the new window.
        """
        self.first_date = self.last_date
        self.last_date += self.delta
        self.delta = None

    def iterate_repository_pagination(self, session, pagination, count):
        """Iterate on repository pagination

        Loads every repository of every page, starting at ``self.page``,
        committing and reporting after each one, and finally records the
        finished query in the database.

        Arguments:
        session -- database session used for loading and committing
        pagination -- search result object providing ``get_page``
        count -- total number of results reported for the query

        Raises RuntimeError when an exit file is found or the content
        folder exceeds ``config.MAX_SIZE``.
        """
        # Round up so the trailing partial page is fetched too; flooring
        # skipped it and processed nothing for counts below one page.
        pages = self._page_count(count)
        # self.page is the loop variable so the position survives an
        # interruption and can be written out by save().
        for self.page in range(self.page, pages):
            if check_exit(
                {"all", "repository_crawler", "repository_crawler.py"}):
                raise RuntimeError("Found .exit file. Exiting")
            if folder_size(str(config.BASE_DIR)) > config.MAX_SIZE:
                raise RuntimeError("Content folder is too big. Clean it up")
            if config.VERBOSE > 1:
                print("> Processing page {}".format(self.page))
            repositories = pagination.get_page(self.page)
            for repository in repositories:
                load_repository(session, "github.com", repository.full_name)
                session.commit()
                self.status.count += 1
                self.status.report()
        query = Query(
            name="repository",
            query=self.query,
            first_date=self.first_date,
            last_date=self.last_date,
            delta=self.delta,
            count=count,
        )
        session.add(query)
        session.commit()
        if config.VERBOSE > 0:
            print("> Finished query. ID={}".format(query.id))

    def recover(self, session):
        """Recover information from .stop.json or database

        ``.stop.json`` takes precedence and resumes at the saved page.
        Otherwise the stored query with the latest ``last_date`` is used
        and the window is advanced past it.

        Returns True when state was restored, False when neither source
        had anything to restore.
        """
        strptime = datetime.strptime
        if os.path.exists(".stop.json"):
            with open(".stop.json", "rb") as stop_file:
                dic = json.load(stop_file)
                # save() stores delta as a date offset from datetime.min;
                # the "000" prefix is added when missing before parsing.
                if not dic["delta"].startswith("000"):
                    dic["delta"] = "000" + dic["delta"]
                self.delta = strptime(dic["delta"], FORMAT) - datetime.min
                self.last_date = strptime(dic["last_date"], FORMAT)
                self.first_date = None
                if dic["first_date"]:
                    self.first_date = strptime(dic["first_date"], FORMAT)
                self.page = dic["page"]
                self.reset_page = False
                self.query = dic["query"]
            return True
        the_query = session.query(Query).order_by(desc(Query.last_date))
        query = the_query.first()
        if query:
            self.last_date = query.last_date
            self.first_date = query.first_date
            self.page = 0
            self.reset_page = True
            self.query = query.query
            self.delta = query.delta
            self.next_range()
            return True
        return False

    def save(self):
        """Save .stop.json

        Writes the current query, window, delta and page so that
        recover() can resume from them.
        """
        stop = {
            "query": self.query,
            "last_date": time(self.last_date),
            "first_date": time(self.first_date),
            # delta is stored as a date offset from datetime.min.
            # NOTE(review): raises TypeError while delta is still None.
            "delta": time(datetime.min + self.delta),
            "page": self.page,
        }
        with open(".stop.json", "w", encoding="utf-8") as stop_file:
            json.dump(stop, stop_file)

    def search_repositories(self):
        """Search repositories

        Recovers previous progress when available, then processes one
        date window after another until the window reaches the present.
        On any error the state is saved to ``.stop.json`` and the process
        exits with status 2 when the content folder is too big, and with
        status 1 otherwise.
        """
        with connect() as session, mount_basedir():
            try:
                if not self.recover(session):
                    # Nothing to resume: the first window must run before
                    # last_date can be compared with the current time.
                    self.iterate_repository_pagination(
                        session, *self.query_repositories())
                    self.next_range()
                while self.last_date < datetime.now():
                    self.iterate_repository_pagination(
                        session, *self.query_repositories())
                    self.next_range()
            except Exception as err:  # pylint: disable=broad-except
                self.save()
                print("Stopped due {}. File '.stop.json' created".format(err))
                import traceback
                if config.VERBOSE > 1:
                    traceback.print_exc()
                if str(err) == "Content folder is too big. Clean it up":
                    sys.exit(2)
                else:
                    sys.exit(1)