def down_from_remote(host_name, user_name, password, port, remote_dir_name, local_dir_name):
    """Download a file or directory from the remote host."""
    if local_dir_name == 'de':
        # Default target: an 'ATtest' folder under the current working directory
        current_path = os.getcwd()
        current_path = current_path + '/' + 'ATtest/'
        local_dir_name = current_path
    t = paramiko.Transport((host_name, port))
    t.connect(username=user_name, password=password)
    sftp = paramiko.SFTPClient.from_transport(t)
    remote_file = sftp.stat(remote_dir_name)
    if isdir(remote_file.st_mode):
        # Directory: cannot be downloaded directly, recurse into its entries
        check_local_dir(local_dir_name)
        print('Start downloading folder: ' + remote_dir_name)
        for remote_file_name in sftp.listdir(remote_dir_name):
            sub_remote = os.path.join(remote_dir_name, remote_file_name)
            sub_remote = sub_remote.replace('\\', '/')
            sub_local = os.path.join(local_dir_name, remote_file_name)
            sub_local = sub_local.replace('\\', '/')
            down_from_remote(host_name, user_name, password, port, sub_remote, sub_local)
    else:
        # Regular file: download directly
        print('Start downloading file: ' + remote_dir_name)
        sftp.get(remote_dir_name, local_dir_name)
    t.close()
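Each example on this page relies on context that is not shown: the `os` and `paramiko` imports, an `isdir` name bound to `stat.S_ISDIR`, and a `check_local_dir` helper that creates the local target directory. A minimal sketch of those assumptions, with placeholder host and credentials, might look like this:

import os
import paramiko
from stat import S_ISDIR as isdir  # the snippets test SFTPAttributes.st_mode with this


def check_local_dir(local_dir_name):
    # Assumed helper: create the local target directory if it is missing.
    if not os.path.exists(local_dir_name):
        os.makedirs(local_dir_name)


if __name__ == '__main__':
    # Placeholder host and credentials, for illustration only.
    down_from_remote('192.0.2.10', 'user', 'secret', 22, '/tmp/logs', '/home/user/logs')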
def down_from_remote(sftp_obj, remote_dir_name, local_dir_name):
    """Download a file or directory from the remote host."""
    remote_file = sftp_obj.stat(remote_dir_name)
    if isdir(remote_file.st_mode):
        # Directory: cannot be downloaded directly, recurse into its entries
        check_local_dir(local_dir_name)
        print('Start downloading directory: ' + remote_dir_name)
        for remote_file_name in sftp_obj.listdir(remote_dir_name):
            local_file_name = os.path.join(local_dir_name, remote_file_name)
            # Convert ':' -> '_'
            local_file_name = local_file_name.replace(':', '_')
            # Normalize '/' and '\\'
            sub_remote = os.path.join(remote_dir_name, remote_file_name)
            sub_remote = sub_remote.replace('\\', '/')
            sub_local = os.path.join(local_dir_name, remote_file_name)
            sub_local = sub_local.replace('\\', '/')
            sub_local = sub_local.replace(':', '_')
            # Only recurse if the remote entry does not exist locally yet
            if not os.path.exists(local_file_name):
                down_from_remote(sftp_obj, sub_remote, sub_local)
    else:
        # Regular file: download directly
        print('Start downloading file: ' + remote_dir_name)
        # Only download if the file does not exist locally yet
        local_file_name = os.path.join(local_dir_name, remote_dir_name)
        if not os.path.exists(local_file_name):
            sftp_obj.get(remote_dir_name, local_dir_name)
def get(sftp, remote, local):
    # Check that the remote path exists
    try:
        result = sftp.stat(remote)
    except IOError as err:
        error = '[ERROR %s] %s: %s' % (
            err.errno, os.path.basename(os.path.normpath(remote)), err.strerror)
        print(error)
    else:
        # Check whether the remote path is a directory
        if isdir(result.st_mode):
            dirname = os.path.basename(os.path.normpath(remote))
            local = os.path.join(local, dirname)
            _check_local(local)
            for file in sftp.listdir(remote):
                sub_remote = os.path.join(remote, file)
                sub_remote = sub_remote.replace('\\', '/')
                get(sftp, sub_remote, local)
        else:
            # Copy a single file
            if os.path.isdir(local):
                local = os.path.join(local, os.path.basename(remote))
            try:
                sftp.get(remote, local)
            except IOError as err:
                print(err)
            else:
                print('[get]', local, '<==', remote)
def download(self, sftp, remote, local):
    # Check that the remote path exists
    try:
        result = sftp.stat(remote)
    except IOError as err:
        error = '[ERROR %s] %s: %s' % (
            err.errno, os.path.basename(os.path.normpath(remote)), err.strerror)
        return {"st": False, "rt": error}
    else:
        # Check whether the remote path is a directory
        if isdir(result.st_mode):
            dirname = os.path.basename(os.path.normpath(remote))
            local = os.path.join(local, dirname)
            _check_local(local)
            for file in sftp.listdir(remote):
                sub_remote = os.path.join(remote, file)
                sub_remote = sub_remote.replace('\\', '/')
                self.download(sftp, sub_remote, local)
        else:
            # Copy a single file
            if os.path.isdir(local):
                local = os.path.join(local, os.path.basename(remote))
            try:
                sftp.get(remote, local)
            except IOError as err:
                return {"st": False, "rt": err}
def get(sftp, remote, local):
    """
    function: download file or folder from server
    param: sftp, remote, local
    """
    try:
        result = sftp.stat(remote)
    except IOError as err:
        error = '[ERROR %s] %s: %s' % (
            err.errno, os.path.basename(os.path.normpath(remote)), err.strerror)
        print(error)
    else:
        if isdir(result.st_mode):
            dirname = os.path.basename(os.path.normpath(remote))
            local = os.path.join(local, dirname)
            _check_local(local)
            for file in sftp.listdir(remote):
                sub_remote = os.path.join(remote, file)
                sub_remote = sub_remote.replace('\\', '/')
                get(sftp, sub_remote, local)
        else:
            if os.path.isdir(local):
                local = os.path.join(local, os.path.basename(remote))
            try:
                sftp.get(remote, local)
            except IOError as err:
                print(err)
            else:
                print('[get]', local, '<==', remote)
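A possible way to drive the recursive get() above, assuming a paramiko SSHClient session (host and credentials are placeholders):

import paramiko

client = paramiko.SSHClient()
client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
client.connect('192.0.2.10', port=22, username='user', password='secret')
sftp = client.open_sftp()
try:
    # Mirrors /var/log/myapp into /home/user/backup/myapp (get() appends the directory name).
    get(sftp, '/var/log/myapp', '/home/user/backup')
finally:
    sftp.close()
    client.close()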
def getFilesListInRemoteHost(sftp, remotedir):
    # Lists holding every file name and its modification time
    file_list = []
    filemtime_list = []
    # Strip a trailing '/' from the path, if present
    if remotedir[-1] == '/':
        remotedir = remotedir[0:-1]
    # List every directory and file (with attributes) under the given directory
    files = sftp.listdir_attr(remotedir)
    for x in files:
        # Full path of each file or directory inside remotedir
        filename = remotedir + '/' + x.filename
        # If it is a directory, recurse into it; isdir is stat.S_ISDIR,
        # named exactly like the corresponding Linux macro
        if isdir(x.st_mode):
            file_list_extend, filemtime_list_extend = getFilesListInRemoteHost(
                sftp, filename)
            file_list.extend(file_list_extend)
            filemtime_list.extend(filemtime_list_extend)
        else:
            # Check the file type: keep only .gz files
            file_ext = x.filename.split('.')[-1]
            if 'gz' == file_ext:
                file_list.append(filename)
                # Record the modification time so later updates can be detected
                filemtime_list.append(
                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(x.st_mtime)))
    return file_list, filemtime_list
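All of these snippets hinge on the same check: `isdir` is `stat.S_ISDIR` applied to the `st_mode` field that paramiko's `SFTPAttributes` exposes just like `os.stat_result`. A small sketch of that idea in isolation, assuming an already-connected `sftp` object:

from stat import S_ISDIR as isdir

def list_remote_dirs(sftp, remotedir):
    # Return only the sub-directories of remotedir, using the same st_mode test as above.
    return [attr.filename
            for attr in sftp.listdir_attr(remotedir)
            if isdir(attr.st_mode)]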
def downLoad(client, sftp, remote, local):
    global logger, finish
    # Check that the remote path exists
    try:
        result = sftp.stat(remote)
    except IOError as err:
        error = '[ERROR %s] %s: %s' % (
            err.errno, os.path.basename(os.path.normpath(remote)), err.strerror)
        logger.error(error)
    else:
        if isdir(result.st_mode):
            dirname = os.path.basename(os.path.normpath(remote))
            local = os.path.join(local, dirname)
            # local = local.replace("\\", "/")
            check_local(local)
            for file in sftp.listdir(remote):
                sub_remote = os.path.join(remote, file)
                sub_remote = sub_remote.replace("\\", "/")
                downLoad(client, sftp, sub_remote, local)
        else:
            if os.path.isdir(local):
                local = os.path.join(local, os.path.basename(remote))
            try:
                sftp.get(remote, local)
            except IOError as err:
                logger.error(err)
            else:
                logger.info('[get] %s %s %s', remote, '==>', local)
                lock.acquire()
                finish += 1
                lock.release()
                logger.info('Downloaded [%d] files', finish)
def down_from_remote(sftp_obj, remote_dir_name, local_dir_name):
    remote_file = sftp_obj.stat(remote_dir_name)
    if isdir(remote_file.st_mode):
        # Directory: create the local target and recurse into its entries
        check_local_dir(local_dir_name)
        print('Start downloading folder: ' + remote_dir_name)
        for remote_file_name in sftp_obj.listdir(remote_dir_name):
            sub_remote = os.path.join(remote_dir_name, remote_file_name)
            sub_remote = sub_remote.replace('\\', '/')
            sub_local = os.path.join(local_dir_name, remote_file_name)
            sub_local = sub_local.replace('\\', '/')
            down_from_remote(sftp_obj, sub_remote, sub_local)
    else:
        # Regular file: download directly
        print('Start downloading file: ' + remote_dir_name)
        sftp_obj.get(remote_dir_name, local_dir_name)
def down_from_remote(self, sftp, remote_dir_name, local_dir_name):
    remote_file = sftp.stat(remote_dir_name)
    if isdir(remote_file.st_mode):
        self.check_local_dir(local_dir_name)
        print('Downloading:' + remote_dir_name)
        for remote_file_name in sftp.listdir(remote_dir_name):
            sub_remote = os.path.join(remote_dir_name, remote_file_name)
            sub_remote = sub_remote.replace('\\', '/')
            sub_local = os.path.join(local_dir_name, remote_file_name)
            sub_local = sub_local.replace('\\', '/')
            self.down_from_remote(sftp, sub_remote, sub_local)
    else:
        print('Downloading:' + remote_dir_name)
        sftp.get(remote_dir_name, local_dir_name)
def download_remote(sftp, remote_path, local_path):
    """Remote Download File"""
    remote_file = sftp.stat(remote_path)
    if isdir(remote_file.st_mode):
        # Download DIR
        check_local_dir(local_path)
        print('Download dir :' + remote_path)
        for remote_file_name in sftp.listdir(remote_path):
            sub_remote = os.path.join(remote_path, remote_file_name)
            sub_remote = sub_remote.replace('\\', '/')
            sub_local = os.path.join(local_path, remote_file_name)
            sub_local = sub_local.replace('\\', '/')
            download_remote(sftp, sub_remote, sub_local)
    else:
        # Download FILE
        print('Download file :' + remote_path)
        sftp.get(remote_path, local_path)
def downfile(self, sftp, local_dir_name, remote_dir_name='/tmp'):
    """
    Download files from the remote server.
    :return:
    """
    remote_file = sftp.stat(remote_dir_name)
    # print(remote_file)
    if isdir(remote_file.st_mode):
        # Directory: cannot be downloaded directly, recurse into its entries
        if not os.path.exists(local_dir_name):
            os.makedirs(local_dir_name)
        print('Start downloading folder: ' + remote_dir_name)
        for remote_file_name in sftp.listdir(remote_dir_name):
            sub_remote = os.path.join(remote_dir_name, remote_file_name)
            sub_remote = sub_remote.replace('\\', '/')
            sub_local = os.path.join(local_dir_name, remote_file_name)
            sub_local = sub_local.replace('\\', '/')
            self.downfile(sftp, sub_local, sub_remote)
    else:
        # Regular file: download directly
        print('Start downloading file: ' + remote_dir_name)
        sftp.get(remote_dir_name, local_dir_name)
def down_from_remote(sftp_obj, remote_dir_path, local_dir_path, filename_at_client):
    """Download a file or folder from the remote host."""
    remote_file = sftp_obj.stat(remote_dir_path)
    # Check whether the remote path is a folder
    if isdir(remote_file.st_mode):
        # file_list = fetch_folder_name(parent_folder)
        # for i in file_list:
        #     download_from_remote(sftp_obj, i.filePath_Server, i.filePath_Client, i.fileName)
        print("Local path: " + local_dir_path)
        print("Folder name: " + filename_at_client)
        print(os.path.join(local_dir_path, filename_at_client))
        local_dir_path = os.path.join(local_dir_path, filename_at_client)
        check_local_dir(local_dir_path)
        print('start downloading the folder:' + remote_dir_path)
        for remote_file_name in sftp_obj.listdir(remote_dir_path):
            sub_remote = os.path.join(remote_dir_path, remote_file_name)
            sub_remote = sub_remote.replace('\\', '/')
            print(remote_file_name)
            # sub_local = os.path.join(local_dir_path, remote_file_name)
            # sub_local = sub_local.replace('\\', '/')
            # down_from_remote(sftp_obj, sub_remote, sub_local, remote_file_name)
            down_from_remote(sftp_obj, sub_remote, local_dir_path, remote_file_name)
    else:
        # The remote path is a single file
        check_local_dir(local_dir_path)
        print('start downloading the file:' + remote_dir_path)
        sub_remote = remote_dir_path.replace('\\', '/')
        # print(local_dir_path + "\\" + filename_at_client)
        local_dir_path = os.path.join(local_dir_path, filename_at_client)
        # local_dir_path = local_dir_path + "\\" + filename_at_client
        # local_dir_path = os.path.join(os.path.split(local_dir_path)[0], filename_at_client)
        local_dir_path = local_dir_path.replace('\\', '/')
        print(local_dir_path)
        sftp_obj.get(sub_remote, local_dir_path)
def get(self, sftp, remote, local):

    def _check_local(local):
        # Create the local directory if it does not exist yet
        if not os.path.exists(local):
            try:
                os.mkdir(local)
            except IOError as err:
                print(err)

    # Check that the remote file exists
    try:
        result = sftp.stat(remote)
    except IOError as err:
        error = '[ERROR %s] %s: %s' % (
            err.errno, os.path.basename(os.path.normpath(remote)), err.strerror)
        print(error)
    else:
        if isdir(result.st_mode):
            # The remote path is a folder: recurse into it
            dirname = os.path.basename(os.path.normpath(remote))
            local = os.path.join(local, dirname)
            _check_local(local)
            for file in sftp.listdir(remote):
                sub_remote = os.path.join(remote, file)
                sub_remote = sub_remote.replace('\\', '/')
                self.get(sftp, sub_remote, local)
        else:
            # The remote path is a file: copy it
            if os.path.isdir(local):
                local = os.path.join(local, os.path.basename(remote))
            try:
                sftp.get(remote, local)
            except IOError as err:
                print(err)
            else:
                print('[get]', local, '<==', remote)
def down_from_remote(sftp_obj, remote_dir_name, local_dir_name):
    """Download a file or directory from the remote host."""
    try:
        remote_file = sftp_obj.stat(remote_dir_name)
    except IOError:
        print(remote_dir_name + " does not exist")
    else:
        if isdir(remote_file.st_mode):
            # Directory: cannot be downloaded directly, recurse into its entries
            check_local_dir(local_dir_name)
            print('Start downloading folder: ' + remote_dir_name)
            for remote_file_name in sftp_obj.listdir(remote_dir_name):
                sub_remote = os.path.join(remote_dir_name, remote_file_name)
                # print(sub_remote)
                # Only fetch .java files that live under /tmp
                if str(sub_remote).startswith("/tmp") and str(sub_remote).endswith(".java"):
                    sub_remote = sub_remote.replace('\\', '/')
                    sub_local = os.path.join(local_dir_name, remote_file_name)
                    sub_local = sub_local.replace('\\', '/')
                    down_from_remote(sftp_obj, sub_remote, sub_local)
        else:
            # Regular file: download directly
            print('Start downloading file: ' + remote_dir_name)
            sftp_obj.get(remote_dir_name, local_dir_name)
def _fast_update_database(self, engine, args):
    """Update all data contained in the given engine quickly, see --fast
    @return number of processed records"""
    nr = 0
    st = time()
    log = self.log()
    progress_every = 5000
    stats_info_every = 500
    commit_every_seconds = 30
    commit_every_records = 15000
    time_of_last_commit = time()
    connection = engine.connect()
    meta = MetaData(engine, reflect=True)
    fsitem = meta.tables[args.table_name]
    insert = fsitem.insert()
    update = (
        fsitem.update()
        .where(fsitem.c.id == bindparam("rid"))
        .values(
            path=bindparam("path"),
            size=bindparam("size"),
            atime=bindparam("atime"),
            ctime=bindparam("ctime"),
            mtime=bindparam("mtime"),
            uid=bindparam("uid"),
            gid=bindparam("gid"),
            nblocks=bindparam("nblocks"),
            nlink=bindparam("nlink"),
            mode=bindparam("mode"),
            ldest=bindparam("ldest"),
            sha1=bindparam("sha1"),
            ratio=bindparam("ratio"),
        )
    )

    # NOTE: this selector assures we only get the latest version of a file, based on the modification time!
    selector = select(
        [
            fsitem.c.id,
            fsitem.c.path,
            fsitem.c.size,
            fsitem.c.atime,
            fsitem.c.ctime,  # marker to see if something is deleted
            fsitem.c.mtime,
            fsitem.c.uid,
            fsitem.c.gid,
            fsitem.c.nblocks,
            fsitem.c.nlink,
            fsitem.c.mode,
            fsitem.c.ldest,
            fsitem.c.sha1,
            fsitem.c.ratio,
        ],
        order_by=[fsitem.c.path, fsitem.c.id.desc()],
    )

    if args.where_like:
        selector = selector.where(fsitem.c.path.like(args.where_like + "%"))
    # end append where clause

    def progress():
        elapsed = time() - st
        log.info("Checked %i files in %.2fs (%.2f files/s)", nr, elapsed, nr / elapsed)
    # end

    join = os.path.join
    isabs = os.path.isabs
    dirname = os.path.dirname
    basename = os.path.basename
    streamer = HashStreamer(hashlib.sha1, lz4dumps)

    ## A mapping from directory names to all of its files (as names)
    dir_entries = dict()

    # A list of sql operators that will update particular entries. They are executed all at once
    # Must include the ID
    updates = list()
    total_num_updates = 0
    modified_count = 0
    added_count = 0
    deleted_count = 0
    last_path = None

    # The window is critical - it is slow for the server, and each query is like a new complete query
    # where only a subset is sent (due to the ordering)
    # Additionally, if there are many changes, we will change the database during iteration, which will
    # basically give us part of the same files (if not the same files) back on the next query, which
    # makes us even more inefficient. Therefore we use memory to our advantage, and use 1mio entries
    # by default. This needs about 1GB of memory, but reduces the amount of queries considerably
    # especially on large databases
    window = 1000 * 1000
    cur_window = 0

    shortest_path = None
    len_shortest_path = 100000000

    for cursor in self._fetch_record_iterator(connection, selector, window):
        nri = 0  # num rows in iteration
        for row in cursor:
            # NOTE: We are getting multiple entries, sorted by the latest one, for the same path
            # We prune all paths of a kind have seen so far
            # Can be files or directories
            nri += 1
            nr += 1
            rid, path, size, atime, ctime, mtime, uid, gid, nblocks, nlink, mode, ldest, sha1, ratio = row
            if not isabs(path) or path == last_path:
                continue
            # end skip relative paths!
            last_path = path
            ascii_path = to_ascii(path)

            # NOTE: I know, this is killing us, as we will grow rather large by keeping all that data
            # But I know no other way except for processing directories while we are going.
            # As files and directories will be mixed, it is not too easy though to figure this out.
            # For now, we just go for it and let the CPU/Memory burn
            directory = dirname(path)
            if directory not in dir_entries:
                dir_entries[directory] = set()
            # end count dirs
            dir_entries[directory].add(basename(path))

            # Make sure we don't forget to set the actual directory - otherwise
            if isdir(mode):
                dir_entries.setdefault(path, set())
            # end add each directory that is a directory

            # Find the root path, which should be the origin of it all, and ignore it when
            # finding added items. It's definitely the shortest one
            if len(directory) < len_shortest_path:
                shortest_path = directory
                len_shortest_path = len(directory)
            # end keep shortest path

            try:
                # For some reason, this doesn't get our unicode as it tries to use ascii to deal with it
                # NOTE: We could know the file was deleted by checking fsitem.c.ctime is None, but
                # we check anyway because it could be re-created.
                stat = lstat(ascii_path)
            except OSError:
                # DELETION
                ##########
                # This marks a deletion - we just keep the time of deletion, which is the time when we
                # noticed it! Not the actual one
                # It didn't exist, but only append this info if we didn't know about that before
                if ctime is not None:
                    # have to write an entire record, otherwise changes and deletions go out of sync
                    updates.append(
                        {
                            "rid": rid,
                            "path": path,
                            "size": 0,
                            "atime": atime,
                            "ctime": None,
                            "mtime": seconds_to_datetime(time()),
                            "uid": uid,
                            "gid": gid,
                            "nblocks": nblocks,
                            "nlink": nlink,
                            "mode": mode,
                            "ldest": ldest,
                            # Keep sha as last known contents! This allows to track deletion even
                            # renames and deletions
                            "sha1": sha1,
                            "ratio": ratio,
                        }
                    )
                    deleted_count += 1
                    if deleted_count % stats_info_every == 0:
                        log.info("Found %i DELETED paths", deleted_count)
                    # end handle deleted
                # end handle deletions
            else:
                # MODIFICATION
                ###############
                # File could have been deleted and re-created
                # We don't know it was an addition (due to previous deletion), but the dataset is the same
                # so people can figure it out later
                # ordered by likeliness
                if (
                    seconds_to_datetime(stat.st_mtime) != mtime
                    or size != stat.st_size
                    or uid != stat.st_uid
                    or gid != stat.st_gid
                    or mode != stat.st_mode
                    or nlink != stat.st_nlink
                    or (islink(stat.st_mode) and readlink(ascii_path) != ldest)
                ):
                    # NOTE: we are lazy here and say, for now, that the size must change to justify
                    # taking another sha. Otherwise we assume that it's just any other change, which we will
                    # put into the database in the form of a new commit, of course.
                    if self._append_path_record(
                        updates, path, streamer, log, stat, size == stat.st_size and (sha1, ratio) or None
                    ):
                        # add the rid to have everything we need for the update
                        updates[-1]["rid"] = rid
                        modified_count += 1
                        if modified_count % stats_info_every == 0:
                            log.info("Found %i MODIFIED paths", modified_count)
                        # end show information
                    # end handle modification
                # end handle modification
            # end handle deleted file

            if nr % progress_every == 0:
                progress()
            # end handle progress

            if len(updates) >= commit_every_records or time() - time_of_last_commit >= commit_every_seconds:
                total_num_updates += len(updates)
                self.do_execute_records(connection, update, updates, log, st, total_num_updates)
                time_of_last_commit = time()
            # end handle executions
        # end for each file in database windows
        cursor.close()

        # Is the database depleted?
        if nri < window:
            break
        # end handle window
    # end for each cursor
    progress()

    total_num_updates += len(updates)
    self.do_execute_records(connection, update, updates, log, st, total_num_updates)

    ########################
    # HANDLE ADDITIONS ###
    ####################
    # We iterate all actual directories and their entries as known to the database
    # Now we just have to compare and only check for additions
    new_records = list()

    def list_dir_safely(dir_ascii):
        """@return entries or an empty tuple() if the listing failed"""
        try:
            return os.listdir(dir_ascii)
        except OSError:
            # ignore added dirs which might already be gone
            log.warn("Couldn't access '%s' when trying to add it", dir_ascii)
            return tuple()
        # end handle exception

    # We can't assign a variable in an outside scope, so we have to make it an array
    last_commit_time = [time()]

    def append_records_recursive(path, added_count):
        """Find all entries recursively in path and append them
        @param path directory or path
        @return amount of added items"""
        # no matter what, add the entry
        if self._append_path_record(new_records, path, streamer, log):
            added_count += 1
            if added_count % stats_info_every == 0:
                log.info("Found %i ADDED paths", added_count)
            # end info printing
            if len(new_records) >= commit_every_records or time() - last_commit_time[0] >= commit_every_seconds:
                self.do_execute_records(connection, insert, new_records, log, st, added_count)
                last_commit_time[0] = time()
        # end handle path

        path_ascii = to_ascii(path)
        if os.path.isdir(path_ascii):
            entries = list_dir_safely(path_ascii)
            for entry in entries:
                added_count = append_records_recursive(join(path, entry), added_count)
            # end for each entry to check
        # end entries
        return added_count
    # end recursion helper

    # Remove shortest directory, which was generated from the directory of our root!
    # NOTE: if there was no root, this is false alarm
    try:
        del dir_entries[shortest_path]
    except KeyError:
        pass
    # end ignore root not in dirlist

    log.info("About to check %i directories for added entries ...", len(dir_entries))
    for dir, entries in dir_entries.iteritems():
        added = set(list_dir_safely(to_ascii(dir))) - entries
        for added_entry in added:
            added_count = append_records_recursive(join(dir, added_entry), added_count)
    # end for each directory to check

    if new_records:
        log.info("Committing remaining %i new records", len(new_records))
        self.do_execute_records(connection, insert, new_records, log, st, added_count)
    # end commit new records

    connection.close()
    elapsed = time() - st
    log.info("== Statistics ==")
    log.info("%5i ADDED", added_count)
    log.info("%5i MODIFIED", modified_count)
    log.info("%5i DELETED", deleted_count)
    log.info("================")
    log.info(
        "Updated %i entries in %.2fs (%.2f entries/s)", total_num_updates, elapsed, total_num_updates / elapsed
    )
    return nr
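The modification/deletion test in `_fast_update_database` above boils down to lstat-ing each recorded path and comparing the on-disk metadata with the stored row. A simplified sketch of just that decision, with a plain dict standing in for the database record and raw epoch seconds instead of the snippet's `seconds_to_datetime` values:

import os
from stat import S_ISLNK

def classify_path(record):
    # 'record' is a hypothetical dict holding the fields the snippet reads from each row.
    try:
        st = os.lstat(record['path'])
    except OSError:
        return 'deleted'      # path no longer exists on disk
    if (st.st_mtime != record['mtime'] or st.st_size != record['size']
            or st.st_uid != record['uid'] or st.st_gid != record['gid']
            or st.st_mode != record['mode'] or st.st_nlink != record['nlink']
            or (S_ISLNK(st.st_mode) and os.readlink(record['path']) != record['ldest'])):
        return 'modified'     # any metadata mismatch counts as a modification
    return 'unchanged'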