def abort_if_not_initialized(self):
    '''Check that the git repository exists and exit otherwise'''
    if self.git_initialized():
        return
    # No repository yet: tell the user how to create one, then bail out.
    message = "You don't seem to have initialized {} for backup."
    print_stderr(message.format(self.directory_to_backup))
    message = "Please use '{} init' to initialize it"
    print_stderr(message.format(self.get_invocation()))
    sys.exit(Errors.REPOSITORY_NOT_INITIALIZED)
def switch_to_correct_branch(self):
    '''Point HEAD at self.branch (which must exist) and reset the index.'''
    self.set_HEAD_to(self.branch)
    self.abort_unless_HEAD_exists()
    # Also reset the index to match HEAD.  Otherwise things go
    # horribly wrong when switching from backing up one computer to
    # another, since the index is still that from the first one.
    msg = "Now working on a new branch, so resetting the index to match..."
    print_stderr(msg)
    check_call(self.git(["read-tree", "HEAD"]))
def print_settings(self):
    '''Report the three effective settings and where each came from.'''
    # Pair each setting with the human-readable description of its origin.
    settings = (
        self.directory_to_backup,
        OptionFrom.string_versions[self.directory_to_backup_from],
        self.branch,
        OptionFrom.string_versions[self.branch_from],
        self.git_directory,
        OptionFrom.string_versions[self.git_directory_from],
    )
    print_stderr('''Settings for backup: backing up the directory {} (set from the {}) ... to the branch "{}" (set from the {}) ... in the git repository {} (set from the {})'''.format(*settings))
def currently_on_correct_branch(self):
    '''Return True if HEAD currently points to 'self.branch', and
    return False otherwise.'''
    process = Popen(self.git(["symbolic-ref", "HEAD"]), stdout=PIPE)
    output = process.communicate()
    if process.returncode != 0:
        print_stderr("Finding what HEAD points to failed")
        sys.exit(Errors.FINDING_HEAD)
    head_ref = output[0].decode().strip()
    # symbolic-ref may report either the short name or the full ref.
    return head_ref in (self.branch, "refs/heads/" + self.branch)
def abort_unless_particular_config(self, key, required_value):
    '''Unless the git config has "required_value" set for "key", exit.'''
    current_value = self.config_value(key)
    if not current_value:
        # The option is absent entirely: set it rather than failing.
        message = "The {} config option was not set, setting to {}"
        print_stderr(message.format(key, required_value))
        self.set_config_value(key, required_value)
    elif current_value != required_value:
        # Present but wrong: refuse to continue.
        message = "The current value for {} is {}, should be: {}"
        print_stderr(message.format(key, current_value, required_value))
        sys.exit(Errors.GIT_CONFIG_ERROR)
def fetch_url(self, url, force=False, cookie_jar=None):
    '''Return the text at `url`, serving a cached copy when one is fresh.

    With force=False (the default) a sufficiently recent cache entry is
    returned without touching the network, and fresh responses are
    written back to the cache.  `cookie_jar` is passed to requests.get.
    '''
    cacher = self.data_cacher
    # `force is False` (not just falsy) matches the caching contract.
    if force is False and cacher.has_cached(url, self.cache_expiry_time):
        cache_file_path = cacher.get_cache_file_path(url)
        print_stderr('Using cached version of "'+url+'"')
        print_stderr(' ('+cache_file_path+')')
        return cacher.load(url)
    print_stderr('Requesting data from "'+url+'"')
    response = requests.get(url, cookies=cookie_jar)
    response_text = self.replace_fancy_chars(response.text)
    if force is False:
        cacher.save(url, response_text)
    return response_text
def abort_unless_HEAD_exists(self):
    '''Exit with NO_SUCH_BRANCH unless HEAD resolves to a real ref.'''
    if self.check_ref("HEAD"):
        return
    message = '''The branch you are trying to back up to does not exist. (Perhaps you haven't run "{} init")'''
    print_stderr(message.format(self.get_invocation()))
    sys.exit(Errors.NO_SUCH_BRANCH)
def __init__(self, command_line_options):
    '''Resolve the backup directory, git directory and branch.

    Each of the three settings is taken from, in order of precedence:
    the command line options, the '.gib.conf' configuration file found
    in the backup directory, or a built-in default.  Alongside each
    value the corresponding *_from attribute records which of those
    sources supplied it (an OptionFrom constant).

    command_line_options: object with .directory, .git_directory and
        .branch attributes (each may be None/empty).
    Exits via sys.exit() with an Errors code on any problem: missing
    HOME, missing backup directory, relative or malformed or missing
    git directory.
    '''
    self.configuration_file = '.gib.conf'
    self.directory_to_backup = None
    self.directory_to_backup_from = None
    self.git_directory = None
    self.git_directory_from = None
    self.branch = None
    self.branch_from = None
    if command_line_options.directory:
        self.directory_to_backup = command_line_options.directory
        self.directory_to_backup_from = OptionFrom.COMMAND_LINE
    else:
        if 'HOME' not in os.environ:
            # Then we can't use HOME as default directory:
            print_stderr("The HOME environment variable was not set")
            sys.exit(Errors.STRANGE_ENVIRONMENT)
        self.directory_to_backup = os.environ['HOME']
        self.directory_to_backup_from = OptionFrom.DEFAULT_VALUE
    # We need to make sure that this is an absolute path before
    # changing directory:
    self.directory_to_backup = os.path.abspath(self.directory_to_backup)
    if not exists_and_is_directory(self.directory_to_backup):
        sys.exit(Errors.DIRECTORY_TO_BACKUP_MISSING)
    # Now we know the directory that we're backing up, try to load the
    # config file:
    configuration = RawConfigParser()
    configuration.read(os.path.join(self.directory_to_backup,
                                    self.configuration_file))
    # Now set the git directory:
    if command_line_options.git_directory:
        self.git_directory = command_line_options.git_directory
        self.git_directory_from = OptionFrom.COMMAND_LINE
    elif configuration.has_option('repository','git_directory'):
        self.git_directory = configuration.get(
            'repository','git_directory'
        )
        self.git_directory_from = OptionFrom.CONFIGURATION_FILE
    else:
        self.git_directory = os.path.join(self.directory_to_backup,'.git')
        self.git_directory_from = OptionFrom.DEFAULT_VALUE
    if not os.path.isabs(self.git_directory):
        print_stderr("The git directory must be an absolute path.")
        sys.exit(Errors.GIT_DIRECTORY_RELATIVE)
    # And finally the branch:
    if command_line_options.branch:
        self.branch = command_line_options.branch
        self.branch_from = OptionFrom.COMMAND_LINE
    elif configuration.has_option('repository','branch'):
        self.branch = configuration.get('repository','branch')
        self.branch_from = OptionFrom.CONFIGURATION_FILE
    else:
        self.branch = 'master'
        self.branch_from = OptionFrom.DEFAULT_VALUE
    # Check that the git_directory ends in '.git'.  NB: this must be a
    # raw string — '\.' is an invalid escape sequence in a plain string
    # literal (a SyntaxWarning on modern Python, an error in future
    # versions).
    if not re.search(r'\.git/*$', self.git_directory):
        message = "The git directory ({}) did not end in '.git'"
        print_stderr(message.format(self.git_directory))
        sys.exit(Errors.BAD_GIT_DIRECTORY)
    # Also check that it actually exists:
    if not os.path.exists(self.git_directory):
        message = "The git directory '{}' does not exist."
        print_stderr(message.format(self.git_directory))
        sys.exit(Errors.GIT_DIRECTORY_MISSING)
# Record timestamps at various places in the script to provide timing information.
TIMESTAMPS = {}
TIMESTAMPS['Started script'] = time.time()

# Allow config override if a config file path is supplied.
if config_file_path is not None:
    # Use a context manager so the handle is closed promptly (the
    # original open() here, and the two below, leaked file objects).
    with open(config_file_path, 'r') as config_file:
        config = json.loads(config_file.read())
    # Only recognised settings may be overridden; unknown keys are ignored.
    for setting in config:
        if setting in CONFIG:
            CONFIG[setting] = config[setting]

# Fetch all text data from all story download URLs obtained.
print_stderr('Fetching stories text...')
stories_text = sys.stdin.read()

# Read in any lists of blacklisted and whitelisted strings.
# Entries are normalised to lower case for case-insensitive matching.
blacklisted_strings = []
for blacklist_file_path in CONFIG['blacklists']:
    with open(blacklist_file_path, 'r') as blacklist_file:
        blacklisted_strings += [line.strip().lower() for line in blacklist_file]

whitelisted_strings = []
for whitelist_file_path in CONFIG['whitelists']:
    with open(whitelist_file_path, 'r') as whitelist_file:
        whitelisted_strings += [line.strip().lower() for line in whitelist_file]

# For our purposes, we're considering all words in a given blacklisted or
# whitelisted string to be blacklisted or whitelisted too.
# Create a UrlFetcher class which will do all the fetching for us (using a cache # to prevent hammering the server) fetcher = UrlFetcher(CONFIG['paths']['cache_dir']) if CONFIG['fetch_mode'] == 'stories': # If `include_mature` was set, Create the `view_mature` cookie to send to # Fimfiction. requests_cookie_jar = None if 'include_mature' in CONFIG and CONFIG['include_mature'] is True: requests_cookie_jar = requests.cookies.RequestsCookieJar() requests_cookie_jar.set('view_mature', 'true', domain='www.fimfiction.net', path='/') # For all requested authors, crawl Fimfiction to find their stories and # obtain a list of story ids. for author in authors: print_stderr('Grabbing stories for author "{}"...'.format(author)) # Fetch the profile page for the given user, and from it, obtain the URL # of their stories page. user_profile_url = CONFIG['base_url']+'/user/'+username_escape(author_username) user_profile_html = fetcher.fetch_url(user_profile_url, cookie_jar=requests_cookie_jar) soup = BeautifulSoup(user_profile_html, 'html.parser') stories_page_link = soup.find(class_='tab-stories').find('a') stories_page_url = CONFIG['base_url']+stories_page_link['href'] stories_page_html = fetcher.fetch_url(stories_page_url, cookie_jar=requests_cookie_jar) author_stories = [] # From the author's stories page, fetch all text download links for all stories on the page. while True: soup = BeautifulSoup(stories_page_html, 'html.parser') # Get all the "chapters footers" (the bar at the bottom of each story card which contains a download
def fetch_page(self, url):
    '''Fetch `url` over this object's HTTP session and return the body text.'''
    print_stderr('Requesting data from {}...'.format(url))
    return self.session.get(url).text