def remap_filepath(old_path, new_filepath):
    """ Called if a better version of a file is found; updates every URL record pointing at the old path to the new location. """
    old_path = stringutil.normalize_file(old_path)
    new_filepath = stringutil.normalize_file(new_filepath)
    with lock('w'), closing(conn.cursor()) as cur:  #!cover
        cur.execute('UPDATE urls SET file_path=:nfp WHERE file_path = :ofp', {'nfp': new_filepath, 'ofp': old_path})
        conn.commit()
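# A hedged usage sketch (the paths are hypothetical): when a better copy of an already-saved file
# is found, every row in 'urls' that still points at the old path gets repointed at the new one.
#
#   remap_filepath('download/example/old_copy.jpg', 'download/example/better_copy.png')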
def process_url(self, url, info):
    """
    Accepts a URL and the dict of file info generated for it by this class, then attempts to
    download it using any possible handler.
    Returns whatever the handlers do, which should be a path to the file itself or to the
    containing directory for an album.
    Also returns False or None if no appropriate handler was found, or if the handler told us
    not to download anything.
    """
    ret_val = False  # Default to 'False', meaning no file was located by a handler.
    for h in self.handlers:
        self.log.out(1, stringutil.color("Checking handler: %s" % h.tag, stringutil.Fore.CYAN))
        ret = False
        # noinspection PyBroadException
        try:
            ret = h.handle(url, info, self.handler_log)
        except Exception:
            # There are too many possible exceptions between all handlers to catch properly.
            # print(sys.exc_info()[0])
            raise  # TODO: Report and stop thread, probably. I want to see errors reported.
        if ret is None:  #!cover
            # None is returned when the handler specifically wants this URL to be "finished",
            # but not added to the files list.
            ret_val = None
            break
        if ret:
            # The handler will return a file/directory name if it worked properly.
            ret_val = stringutil.normalize_file(ret)
            break
    return ret_val
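# A hedged sketch of the handler contract that process_url relies on; this is not one of the real
# bundled handlers. Each handler exposes a 'tag' and a handle(url, info, log) method that returns
# a saved file/directory path on success, False to let the next handler try, or None to mark the
# URL finished without saving anything.
class _ExampleHandler:
    tag = 'example'

    def handle(self, url, info, log):
        if 'example.com' not in url:
            return False  # Not a URL this handler understands; pass it along.
        # ...download the resource to info['single_file'] % '.jpg' here...
        return info['single_file'] % '.jpg'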
def create(file, base_dir=None):
    global conn
    with lock('w'):
        if base_dir is not None:
            file = stringutil.normalize_file(base_dir + '/' + file)
        build = file == ':memory:' or not os.path.isfile(file)
        if base_dir is not None and not os.path.isdir(base_dir):
            # Only create the base directory when one was actually supplied.
            os.makedirs(base_dir)
        conn = sqlite3.connect(file, check_same_thread=False)
        if build:
            with closing(conn.cursor()) as cur:
                cur.execute('''CREATE TABLE posts (
                    id text PRIMARY KEY, author text, source_alias text, subreddit text, title text, type text
                )''')
                cur.execute('''CREATE TABLE urls (
                    post_id text, url text, file_path text
                )''')
                cur.execute('''CREATE TABLE hashes (
                    file_path text PRIMARY KEY, lastmtime int, hash text
                )''')
                cur.execute('''CREATE TABLE metadata (
                    meta_key text PRIMARY KEY, meta_val text
                )''')
                conn.commit()
            with closing(conn.cursor()) as cur:
                cur.execute('INSERT INTO metadata VALUES (?,?)', ('version', '1.0'))
                cur.execute('INSERT INTO metadata VALUES (?,?)', ('author', 'ShadowMoose'))
                cur.execute('INSERT INTO metadata VALUES (?,?)', ('website', 'https://goo.gl/hgBxN4'))
                conn.commit()
            print("Built DB.")
    print('Connected to DB.')
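def _example_create_usage():
    """ Illustrative only: a hedged sketch of opening the manifest DB. The file and directory
    names here are hypothetical, not values this module requires. """
    create('manifest.sqldb', base_dir='./storage')  # Builds the tables on first run, just reconnects afterwards.
    # create(':memory:')  # An in-memory DB also works, e.g. for tests; it always builds fresh tables.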
def put_file_hash(f_path, f_hash, f_lastmtime):
    """ Adds the given hash data for the given filename. """
    f_path = stringutil.normalize_file(f_path)
    with lock('w'), closing(conn.cursor()) as cur:
        cur.execute(
            'INSERT OR REPLACE INTO hashes (file_path, lastmtime, hash) VALUES (?,?,?)',
            (f_path, f_lastmtime, f_hash))
        conn.commit()
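def _example_put_hash_usage(file_path):
    """ Illustrative only: a hedged sketch of storing a file's hash alongside its mtime, so a
    later lookup can skip re-hashing files that haven't changed. The hash string is a placeholder. """
    put_file_hash(file_path, 'deadbeefdeadbeef', os.path.getmtime(file_path))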
def process_ele(self, reddit_element):
    """
    Accepts a RedditElement of Post/Comment details, then runs through the loaded Handlers,
    attempting to download each of its URLs.
    """
    self.log.out(0, 'Processing new ele...')
    self.handler_log.clear()
    # print('\n\n\nProcessing ele: %s' % reddit_element.to_obj())
    self.log.out(0, stringutil.out(
        "[%s](%s): %s" % (reddit_element.type, reddit_element.subreddit, reddit_element.title),
        False, stringutil.Fore.LIGHTYELLOW_EX))
    for url in reddit_element.get_urls():
        # print('Handling URL: %s' % url)
        url_info = manifest.get_url_info(url)
        # print('URL Info:', url_info)
        if url_info:
            file = url_info['file_path']
            if file and os.path.exists(file):
                # print('URL already handled.')
                reddit_element.add_file(url, file)
                hashjar.add_hash(file)  # Update hash, just in case it doesn't have this file.
                continue
        # This URL hasn't been handled yet! Time to download it:
        file_info = self.build_file_info(reddit_element)  # Build the file information dict using this RedditElement's information.
        if file_info is None:
            reddit_element.add_file(url, False)  # This mostly happens if the filename can't be generated.
        else:
            # Download the file from the new url, using the loaded Handlers:
            file_path = self.process_url(url, file_info)  # The important bit is here, & doesn't need the Lock.
            if file_path:
                file_path = stringutil.normalize_file(file_path)  # Normalize for all DB storage.
            if not self.keep_running:
                return  # Kill the thread after a potentially long-running download if the program has terminated.  #!cover
            reddit_element.add_file(url, self.check_duplicates(file_path))
    manifest.insert_post(reddit_element)  # Update Manifest with the completed ele.
    with HandlerThread.ele_lock:
        # Clear the blacklisted filename list, just to release the memory.
        for r in self.release_filenames:
            HandlerThread.used_files.remove(r)
        self.release_filenames = []
def add_hash(filename):
    """
    Add the given file to the Hash jar.
    :param filename: The path to the file to add.
    :return: ([Is New File], existing_file_path)
    """
    if filename:
        filename = stringutil.normalize_file(filename)  # Normalize for safety.
    if not filename or not os.path.exists(filename) or os.path.isdir(filename):
        # Skip directories.
        return True, None
    pre = manifest.get_file_hash(filename)  # Start with a simple lookup to see if this path's hash is stored already.
    lmt = os.path.getmtime(filename)
    if pre:
        if lmt == pre['lastmtime']:
            # Hash already exists and file hasn't changed since its last storage.
            return False, filename
    # If we didn't find the hash, or this file has been modified, re-hash.
    _, final_hash = _get_best_hash(filename)
    if not final_hash:  #!cover
        stringutil.error("HashCheck :: Error hit hashing file, passing file as new.")
        return True, None
    manifest.put_file_hash(filename, final_hash, lmt)  # Store the hash of every file processed.
    # NOTE: Now that this file is stored, it's up to anything that deletes an archived file to also remove the hash.
    _it = manifest.hash_iterator(len(final_hash))
    for h in _it:
        if h['file_path'] == filename:
            continue  # Since we've just added this file's hash, we don't want to match with it!
        dist = _hamming_distance(h['hash'], final_hash)
        if dist < 4:
            # print('\tHashCheck :: Distance matches existing file (%s,%s): %s' % (final_hash, h, dist))
            _it.send(True)  # Release DB generator.
            return False, h['file_path']
    # print('\tHashCheck :: File is unique. Saved successfully.')
    return True, None
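def _example_add_hash_usage(path):
    """ Illustrative only: a hedged sketch of interpreting add_hash()'s return value.
    (True, None) means the file is new or unique; (False, existing_path) means it duplicates
    a file that is already archived. """
    is_new, existing = add_hash(path)
    if not is_new and existing and existing != path:
        print('Near-duplicate of an existing file: %s' % existing)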
def build_file_info(self, reddit_element):
    """
    Generates a dict of file locations and element data that is passed down to every handler,
    so they can choose where best to save for themselves.
    """
    with HandlerThread.ele_lock:
        dir_pattern = '%s/%s' % (self.settings.save_base(), self.settings.save_subdir())
        file_pattern = '%s/%s' % (dir_pattern, self.settings.save_filename())
        basedir = stringutil.insert_vars(dir_pattern, reddit_element)
        basefile = stringutil.insert_vars(file_pattern, reddit_element)
        if basedir is None or basefile is None:
            # Cannot download this file, because the file path generated for it is too long.
            return None  #!cover
        og = basefile
        i = 2
        while basefile in HandlerThread.used_files:
            # Use the local list of used filenames here, since they otherwise won't be updated until we're done.
            basefile = og + ' . ' + str(i)
            basefile = stringutil.normalize_file(basefile)
            i += 1
        HandlerThread.used_files.append(basefile)  # Blacklist this base name while we download.
        self.release_filenames.append(basefile)
        # Build a dict of pre-generated possible locations & important data for handlers to have access to.
        return {
            'parent_dir': basedir,  # Some handlers will need to build the parent directory for their single file first. This simplifies parsing.
            'single_file': basefile + "%s",  # If this handler can output a single file, it will use this path.
            'multi_dir': basefile + "/",  # If the handler is going to download multiple files, it will save them under this directory.
            'post_title': reddit_element.title,  # The title of the Reddit post.
            'post_subreddit': reddit_element.subreddit,  # The subreddit this post came from.
            'user_agent': self.settings.get('auth', None)['user_agent'],
        }
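# A hedged sketch of the dict build_file_info() returns, with hypothetical example values; the
# real paths depend on the configured save_base/save_subdir/save_filename patterns:
#
#   {
#       'parent_dir':     'download/aww',
#       'single_file':    'download/aww/Cute Cat%s',   # A handler fills in the extension, e.g. % '.jpg'.
#       'multi_dir':      'download/aww/Cute Cat/',    # Album handlers save multiple files under this.
#       'post_title':     'Cute Cat',
#       'post_subreddit': 'aww',
#       'user_agent':     '<from the auth settings>',
#   }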
def remove_file_hash(f_path):
    """ Remove any hashes for the given path. """
    f_path = stringutil.normalize_file(f_path)
    with lock('w'), closing(conn.cursor()) as cur:
        cur.execute('DELETE FROM hashes WHERE file_path=:fp', {'fp': f_path})
        conn.commit()
def get_file_hash(f_path):
    """ Returns a dictionary of the stored hash info for the given file, or None. """
    f_path = stringutil.normalize_file(f_path)
    return _select_fancy('hashes', ['lastmtime', 'hash'], 'file_path = :fname', {'fname': f_path})
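def _example_hash_lookup_usage(path):
    """ Illustrative only: a hedged sketch of checking a stored hash and clearing it when the
    file has changed on disk since it was hashed. """
    stored = get_file_hash(path)
    if stored and stored['lastmtime'] != os.path.getmtime(path):
        remove_file_hash(path)  # The cached hash is stale; the next add_hash() call will re-hash the file.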