def list(self, directory=''):
    '''
    List a remote HTTP directory by downloading the index page and parsing
    it with the regular expressions defined in configuration
    (``http.parse.dir.line`` / ``http.parse.file.line`` plus the matching
    ``http.group.*`` group indexes).

    :param directory: sub-directory appended to rootdir in the URL
    :type directory: str
    :return: tuple (files, dirs); each element is a dict with keys
             permissions/group/user/size/month/day/year/name
             (file entries also carry a 'hash' fingerprint)
    '''
    logging.debug('Download:List:' + self.url + self.rootdir + directory)
    try:
        self.crl.setopt(pycurl.URL, self.url + self.rootdir + directory)
    except Exception:
        # Some pycurl builds only accept byte-string options: retry with ascii.
        self.crl.setopt(
            pycurl.URL,
            (self.url + self.rootdir + directory).encode('ascii', 'ignore'))
    if self.proxy is not None:
        self.crl.setopt(pycurl.PROXY, self.proxy)
        if self.proxy_auth is not None:
            self.crl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)
    if self.credentials is not None:
        self.crl.setopt(pycurl.USERPWD, self.credentials)
    output = BytesIO()
    # lets assign this buffer to pycurl object
    self.crl.setopt(pycurl.WRITEFUNCTION, output.write)
    self.crl.setopt(pycurl.HEADERFUNCTION, self.header_function)
    self.crl.perform()
    # Figure out what encoding was sent with the response, if any.
    # Check against lowercased header name.
    encoding = None
    if 'content-type' in self.headers:
        content_type = self.headers['content-type'].lower()
        match = re.search(r'charset=(\S+)', content_type)
        if match:
            encoding = match.group(1)
    if encoding is None:
        # Default encoding for HTML is iso-8859-1.
        # Other content types may have different default encoding,
        # or in case of binary data, may have no encoding at all.
        encoding = 'iso-8859-1'
    # lets get the output in a string
    result = output.getvalue().decode(encoding)
    # Expected configuration example:
    #   http.parse.dir.line:
    #     <a[\s]+href="([\S]+)/".*alt="\[DIR\]">.*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})
    #   http.parse.file.line:
    #     <a[\s]+href="([\S]+)".*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})[\s]+([\d\.]+[MKG]{0,1})
    #   http.group.dir.name: 1, http.group.dir.date: 2,
    #   http.group.file.name: 1, http.group.file.date: 2, http.group.file.size: 3
    rfiles = []
    rdirs = []
    dirs = re.findall(self.config.get('http.parse.dir.line'), result)
    if dirs:
        for founddir in dirs:
            rfile = {}
            rfile['permissions'] = ''
            rfile['group'] = ''
            rfile['user'] = ''
            rfile['size'] = '0'
            date = founddir[int(self.config.get('http.group.dir.date')) - 1]
            dirdate = date.split()
            parts = dirdate[0].split('-')
            # e.g. 19-Jul-2014 13:02
            rfile['month'] = Utils.month_to_num(parts[1])
            rfile['day'] = parts[0]
            rfile['year'] = parts[2]
            rfile['name'] = founddir[
                int(self.config.get('http.group.dir.name')) - 1]
            rdirs.append(rfile)
    files = re.findall(self.config.get('http.parse.file.line'), result)
    if files:
        for foundfile in files:
            rfile = {}
            rfile['permissions'] = ''
            rfile['group'] = ''
            rfile['user'] = ''
            rfile['size'] = foundfile[
                int(self.config.get('http.group.file.size')) - 1]
            date = foundfile[
                int(self.config.get('http.group.file.date')) - 1]
            if self.config.get('http.parse.file.date.format'):
                # Explicit date format configured; '%%' escapes '%' in config files.
                date_object = datetime.datetime.strptime(
                    date,
                    self.config.get('http.parse.file.date.format').replace(
                        '%%', '%'))
                rfile['month'] = date_object.month
                rfile['day'] = date_object.day
                rfile['year'] = date_object.year
            else:
                dirdate = date.split()
                parts = dirdate[0].split('-')
                # e.g. 19-Jul-2014 13:02
                rfile['month'] = Utils.month_to_num(parts[1])
                rfile['day'] = parts[0]
                rfile['year'] = parts[2]
            rfile['name'] = foundfile[
                int(self.config.get('http.group.file.name')) - 1]
            # md5 used only as a change-detection fingerprint, not for security
            filehash = (rfile['name'] + str(date)
                        + str(rfile['size'])).encode('utf-8')
            rfile['hash'] = hashlib.md5(filehash).hexdigest()
            rfiles.append(rfile)
            # removed leftover debug print ("###OSALLOU ...")
    return (rfiles, rdirs)
def list(self, directory=''):
    '''
    List FTP directory.

    Downloads the raw FTP LIST output of ``rootdir + directory`` through
    pycurl and parses each line in unix ``ls -l`` style.

    :param directory: sub-directory appended to rootdir in the URL
    :return: tuple of file and dirs in current directory with details
    '''
    logging.debug('Download:List:' + self.url + self.rootdir + directory)
    #self.crl.setopt(pycurl.URL, self.url+self.rootdir+directory)
    try:
        self.crl.setopt(pycurl.URL, self.url + self.rootdir + directory)
    except Exception as a:
        # Some pycurl builds only accept byte-string options: retry with ascii.
        self.crl.setopt(pycurl.URL,
                        (self.url + self.rootdir + directory).encode(
                            'ascii', 'ignore'))
    if self.proxy is not None:
        self.crl.setopt(pycurl.PROXY, self.proxy)
        if self.proxy_auth is not None:
            self.crl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)
    if self.credentials is not None:
        self.crl.setopt(pycurl.USERPWD, self.credentials)
    output = BytesIO()
    # lets assign this buffer to pycurl object
    self.crl.setopt(pycurl.WRITEFUNCTION, output.write)
    self.crl.setopt(pycurl.HEADERFUNCTION, self.header_function)
    self.crl.setopt(pycurl.CONNECTTIMEOUT, 300)
    # Download should not take more than 5minutes
    self.crl.setopt(pycurl.TIMEOUT, self.timeout)
    # NOSIGNAL avoids SIGALRM-based timeouts, required for threaded use of libcurl
    self.crl.setopt(pycurl.NOSIGNAL, 1)
    try:
        self.crl.perform()
    except Exception as e:
        # Best-effort: log and fall through to parse whatever was received
        # (possibly an empty buffer).
        logging.error('Could not get errcode:' + str(e))
    # Figure out what encoding was sent with the response, if any.
    # Check against lowercased header name.
    encoding = None
    if 'content-type' in self.headers:
        content_type = self.headers['content-type'].lower()
        match = re.search('charset=(\S+)', content_type)
        if match:
            encoding = match.group(1)
    if encoding is None:
        # Default encoding for HTML is iso-8859-1.
        # Other content types may have different default encoding,
        # or in case of binary data, may have no encoding at all.
        encoding = 'iso-8859-1'
    # lets get the output in a string
    result = output.getvalue().decode(encoding)
    # FTP LIST output is separated by \r\n
    # lets split the output in lines
    #lines = result.split(r'[\r\n]+')
    lines = re.split(r'[\n\r]+', result)
    # lets walk through each line
    rfiles = []
    rdirs = []
    for line in lines:
        rfile = {}
        # lets print each part separately
        parts = line.split()
        # the individual fields in this list of parts
        if not parts:
            continue
        rfile['permissions'] = parts[0]
        # NOTE(review): parts[2]/parts[3] map group before user, which is the
        # reverse of the usual 'ls -l' owner-then-group order — confirm
        # against real server output.
        rfile['group'] = parts[2]
        rfile['user'] = parts[3]
        rfile['size'] = parts[4]
        rfile['month'] = Utils.month_to_num(parts[5])
        rfile['day'] = parts[6]
        # md5 of the whole LIST line, used as a change-detection fingerprint
        rfile['hash'] = hashlib.md5(line.encode('utf-8')).hexdigest()
        try:
            rfile['year'] = int(parts[7])
        except Exception as e:
            # specific ftp case issues at getting date info: recent entries
            # carry a HH:MM time instead of a year in this column.
            # NOTE(review): datetime.now() assumes 'from datetime import
            # datetime' at module level — confirm import style.
            curdate = datetime.now()
            rfile['year'] = curdate.year
            # Year not precised, month greater than current means previous year
            if rfile['month'] > curdate.month:
                rfile['year'] = curdate.year - 1
            # Same month but later day => previous year
            if rfile['month'] == curdate.month and int(
                    rfile['day']) > curdate.day:
                rfile['year'] = curdate.year - 1
        rfile['name'] = parts[8]
        if len(parts) >= 10 and parts[9] == '->':
            # Symlink, add to files AND dirs as we don't know the type of the link
            rdirs.append(rfile)
        is_dir = False
        if re.match('^d', rfile['permissions']):
            is_dir = True
        if not is_dir:
            rfiles.append(rfile)
        else:
            rdirs.append(rfile)
    return (rfiles, rdirs)
def list(self, directory=''):
    '''
    List a remote HTTP directory by downloading the index page and parsing
    it with the regular expressions defined in configuration
    (``http.parse.dir.line`` / ``http.parse.file.line`` plus the matching
    ``http.group.*`` group indexes).

    :param directory: sub-directory appended to rootdir in the URL
    :type directory: str
    :return: tuple (files, dirs); each element is a dict with keys
             permissions/group/user/size/month/day/year/name
    '''
    logging.debug('Download:List:' + self.url + self.rootdir + directory)
    try:
        self.crl.setopt(pycurl.URL, self.url + self.rootdir + directory)
    except Exception:
        # Some pycurl builds only accept byte-string options: retry with ascii.
        self.crl.setopt(
            pycurl.URL,
            (self.url + self.rootdir + directory).encode('ascii', 'ignore'))
    if self.proxy is not None:
        self.crl.setopt(pycurl.PROXY, self.proxy)
        if self.proxy_auth is not None:
            # BUG FIX: was 'curl.setopt' (NameError); must go through self.crl
            self.crl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)
    if self.credentials is not None:
        self.crl.setopt(pycurl.USERPWD, self.credentials)
    output = BytesIO()
    # lets assign this buffer to pycurl object
    self.crl.setopt(pycurl.WRITEFUNCTION, output.write)
    self.crl.setopt(pycurl.HEADERFUNCTION, self.header_function)
    self.crl.perform()
    # Figure out what encoding was sent with the response, if any.
    # Check against lowercased header name.
    encoding = None
    if 'content-type' in self.headers:
        content_type = self.headers['content-type'].lower()
        match = re.search(r'charset=(\S+)', content_type)
        if match:
            encoding = match.group(1)
    if encoding is None:
        # Default encoding for HTML is iso-8859-1.
        # Other content types may have different default encoding,
        # or in case of binary data, may have no encoding at all.
        encoding = 'iso-8859-1'
    # lets get the output in a string
    result = output.getvalue().decode(encoding)
    rfiles = []
    rdirs = []
    dirs = re.findall(self.config.get('http.parse.dir.line'), result)
    if dirs:
        # renamed loop variable: 'dir' shadowed the builtin
        for founddir in dirs:
            rfile = {}
            rfile['permissions'] = ''
            rfile['group'] = ''
            rfile['user'] = ''
            rfile['size'] = '0'
            date = founddir[int(self.config.get('http.group.dir.date')) - 1]
            dirdate = date.split()
            parts = dirdate[0].split('-')
            # e.g. 19-Jul-2014 13:02
            rfile['month'] = Utils.month_to_num(parts[1])
            rfile['day'] = parts[0]
            rfile['year'] = parts[2]
            rfile['name'] = founddir[
                int(self.config.get('http.group.dir.name')) - 1]
            rdirs.append(rfile)
    files = re.findall(self.config.get('http.parse.file.line'), result)
    if files:
        # renamed loop variable: 'file' shadowed the builtin
        for foundfile in files:
            rfile = {}
            rfile['permissions'] = ''
            rfile['group'] = ''
            rfile['user'] = ''
            rfile['size'] = foundfile[
                int(self.config.get('http.group.file.size')) - 1]
            date = foundfile[
                int(self.config.get('http.group.file.date')) - 1]
            dirdate = date.split()
            parts = dirdate[0].split('-')
            # e.g. 19-Jul-2014 13:02
            rfile['month'] = Utils.month_to_num(parts[1])
            rfile['day'] = parts[0]
            rfile['year'] = parts[2]
            rfile['name'] = foundfile[
                int(self.config.get('http.group.file.name')) - 1]
            rfiles.append(rfile)
    return (rfiles, rdirs)
def list(self, directory=''):
    '''
    List FTP directory.

    Fetches the raw FTP LIST output of ``rootdir + directory`` through
    pycurl and parses each unix ``ls -l`` style line into a dict.

    :param directory: sub-directory appended to rootdir in the URL
    :return: tuple of file and dirs in current directory with details
    '''
    logging.debug('Download:List:' + self.url + self.rootdir + directory)
    listing_url = self.url + self.rootdir + directory
    try:
        self.crl.setopt(pycurl.URL, listing_url)
    except Exception:
        # Some pycurl builds only accept byte-string options: retry with ascii.
        self.crl.setopt(pycurl.URL, listing_url.encode('ascii', 'ignore'))
    if self.proxy is not None:
        self.crl.setopt(pycurl.PROXY, self.proxy)
        if self.proxy_auth is not None:
            self.crl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)
    if self.credentials is not None:
        self.crl.setopt(pycurl.USERPWD, self.credentials)
    # Collect the body into an in-memory buffer.
    buffer = BytesIO()
    self.crl.setopt(pycurl.WRITEFUNCTION, buffer.write)
    self.crl.setopt(pycurl.HEADERFUNCTION, self.header_function)
    self.crl.perform()
    # Determine the response charset from the Content-Type header;
    # fall back to iso-8859-1 (the HTML default) when absent.
    encoding = None
    if 'content-type' in self.headers:
        charset_match = re.search('charset=(\S+)',
                                  self.headers['content-type'].lower())
        if charset_match:
            encoding = charset_match.group(1)
    if encoding is None:
        encoding = 'iso-8859-1'
    listing = buffer.getvalue().decode(encoding)
    rfiles = []
    rdirs = []
    # LIST output lines are terminated by \r\n.
    for line in re.split(r'[\n\r]+', listing):
        fields = line.split()
        if not fields:
            continue
        entry = {
            'permissions': fields[0],
            'group': fields[2],
            'user': fields[3],
            'size': fields[4],
            'month': Utils.month_to_num(fields[5]),
            'day': fields[6],
        }
        try:
            entry['year'] = int(fields[7])
        except Exception:
            # Recent entries carry HH:MM instead of a year: infer it.
            today = datetime.now()
            entry['year'] = today.year
            # A month (or same month, later day) ahead of today means
            # the entry is from the previous year.
            if entry['month'] > today.month:
                entry['year'] = today.year - 1
            if entry['month'] == today.month and int(entry['day']) > today.day:
                entry['year'] = today.year - 1
        entry['name'] = fields[8]
        if len(fields) >= 10 and fields[9] == '->':
            # Symlink: add to files AND dirs as the link target type is unknown
            rdirs.append(entry)
        if entry['permissions'].startswith('d'):
            rdirs.append(entry)
        else:
            rfiles.append(entry)
    return (rfiles, rdirs)
def list(self, directory=''):
    '''
    Try to get file headers to get last_modification and size.

    Issues a body-less request (pycurl NOBODY) for every file in
    self.files_to_download and fills in its size/day/month/year fields
    from the Content-Length and Last-Modified response headers.

    :param directory: unused, kept for interface compatibility
    :return: tuple (self.files_to_download, []) — no directory entries
    '''
    # renamed loop variable: 'file' shadowed the builtin
    for rfile in self.files_to_download:
        self.crl.setopt(pycurl.HEADER, True)
        if self.credentials is not None:
            self.crl.setopt(pycurl.USERPWD, self.credentials)
        if self.proxy is not None:
            self.crl.setopt(pycurl.PROXY, self.proxy)
            if self.proxy_auth is not None:
                # BUG FIX: was 'curl.setopt' (NameError); must use self.crl
                self.crl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)
        self.crl.setopt(pycurl.NOBODY, True)
        try:
            self.crl.setopt(pycurl.URL,
                            self.url + self.rootdir + rfile['name'])
        except Exception:
            # Some pycurl builds only accept byte-string options.
            self.crl.setopt(
                pycurl.URL,
                (self.url + self.rootdir + rfile['name']).encode(
                    'ascii', 'ignore'))
        output = BytesIO()
        # lets assign this buffer to pycurl object
        self.crl.setopt(pycurl.WRITEFUNCTION, output.write)
        self.crl.setopt(pycurl.HEADERFUNCTION, self.header_function)
        self.crl.perform()
        # Figure out what encoding was sent with the response, if any.
        encoding = None
        if 'content-type' in self.headers:
            content_type = self.headers['content-type'].lower()
            match = re.search(r'charset=(\S+)', content_type)
            if match:
                encoding = match.group(1)
        if encoding is None:
            # Default encoding for HTML is iso-8859-1.
            encoding = 'iso-8859-1'
        result = output.getvalue().decode(encoding)
        lines = re.split(r'[\n\r]+', result)
        for line in lines:
            # BUG FIX: split on the FIRST ':' only; the old split(':')
            # truncated header values at the time's colons (08:49:37), so
            # the asctime format below could never match.
            parts = line.split(':', 1)
            if len(parts) < 2:
                continue
            header_name = parts[0].strip()
            header_value = parts[1].strip()
            if header_name == 'Content-Length':
                rfile['size'] = header_value
            if header_name == 'Last-Modified':
                # RFC 1123 format: Sun, 06 Nov 1994 08:49:37 GMT
                res = re.match(r'(\w+),\s+(\d+)\s+(\w+)\s+(\d+)', header_value)
                if res:
                    rfile['day'] = res.group(2)
                    rfile['month'] = Utils.month_to_num(res.group(3))
                    rfile['year'] = res.group(4)
                    continue
                # RFC 850 format: Sunday, 06-Nov-94 08:49:37 GMT
                res = re.match(r'(\w+),\s+(\d+)-(\w+)-(\d+)', header_value)
                if res:
                    rfile['day'] = res.group(2)
                    rfile['month'] = Utils.month_to_num(res.group(3))
                    # BUG FIX: two-digit years >= 70 are 19xx per RFC 7231,
                    # not 20xx (06-Nov-94 is 1994, not 2094).
                    yy = int(res.group(4))
                    rfile['year'] = str(1900 + yy if yy >= 70 else 2000 + yy)
                    continue
                # asctime format: Sun Nov  6 08:49:37 1994
                res = re.match(
                    r'(\w+)\s+(\w+)\s+(\d+)\s+\d{2}:\d{2}:\d{2}\s+(\d+)',
                    header_value)
                if res:
                    rfile['day'] = res.group(3)
                    rfile['month'] = Utils.month_to_num(res.group(2))
                    rfile['year'] = res.group(4)
                    continue
    return (self.files_to_download, [])
def list(self, directory=''):
    '''
    Try to get file headers to get last_modification and size.

    Issues a body-less request (pycurl NOBODY) for every file in
    self.files_to_download and fills in its size/day/month/year/hash
    fields from the Content-Length and Last-Modified response headers.

    :param directory: unused, kept for interface compatibility
    :return: tuple (self.files_to_download, []) — no directory entries
    '''
    # renamed loop variable: 'file' shadowed the builtin
    for rfile in self.files_to_download:
        self.crl.setopt(pycurl.HEADER, True)
        if self.credentials is not None:
            self.crl.setopt(pycurl.USERPWD, self.credentials)
        if self.proxy is not None:
            self.crl.setopt(pycurl.PROXY, self.proxy)
            if self.proxy_auth is not None:
                self.crl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)
        self.crl.setopt(pycurl.NOBODY, True)
        try:
            self.crl.setopt(pycurl.URL,
                            self.url + self.rootdir + rfile['name'])
        except Exception:
            # Some pycurl builds only accept byte-string options.
            self.crl.setopt(
                pycurl.URL,
                (self.url + self.rootdir + rfile['name']).encode(
                    'ascii', 'ignore'))
        output = BytesIO()
        # lets assign this buffer to pycurl object
        self.crl.setopt(pycurl.WRITEFUNCTION, output.write)
        self.crl.setopt(pycurl.HEADERFUNCTION, self.header_function)
        self.crl.perform()
        # Figure out what encoding was sent with the response, if any.
        encoding = None
        if 'content-type' in self.headers:
            content_type = self.headers['content-type'].lower()
            match = re.search(r'charset=(\S+)', content_type)
            if match:
                encoding = match.group(1)
        if encoding is None:
            # Default encoding for HTML is iso-8859-1.
            encoding = 'iso-8859-1'
        result = output.getvalue().decode(encoding)
        lines = re.split(r'[\n\r]+', result)
        for line in lines:
            # BUG FIX: split on the FIRST ':' only; the old split(':')
            # truncated header values at the time's colons (08:49:37), so
            # the asctime format below could never match.
            parts = line.split(':', 1)
            if len(parts) < 2:
                continue
            header_name = parts[0].strip()
            header_value = parts[1].strip()
            if header_name == 'Content-Length':
                rfile['size'] = header_value
            if header_name == 'Last-Modified':
                # RFC 1123 format: Sun, 06 Nov 1994 08:49:37 GMT
                res = re.match(r'(\w+),\s+(\d+)\s+(\w+)\s+(\d+)', header_value)
                if res:
                    # md5 of the matched date, used as a change fingerprint
                    rfile['hash'] = hashlib.md5(
                        str(res.group(0)).encode('utf-8')).hexdigest()
                    rfile['day'] = res.group(2)
                    rfile['month'] = Utils.month_to_num(res.group(3))
                    rfile['year'] = res.group(4)
                    continue
                # RFC 850 format: Sunday, 06-Nov-94 08:49:37 GMT
                res = re.match(r'(\w+),\s+(\d+)-(\w+)-(\d+)', header_value)
                if res:
                    rfile['hash'] = hashlib.md5(
                        str(res.group(0)).encode('utf-8')).hexdigest()
                    rfile['day'] = res.group(2)
                    rfile['month'] = Utils.month_to_num(res.group(3))
                    # BUG FIX: two-digit years >= 70 are 19xx per RFC 7231,
                    # not 20xx (06-Nov-94 is 1994, not 2094).
                    yy = int(res.group(4))
                    rfile['year'] = str(1900 + yy if yy >= 70 else 2000 + yy)
                    continue
                # asctime format: Sun Nov  6 08:49:37 1994
                res = re.match(
                    r'(\w+)\s+(\w+)\s+(\d+)\s+\d{2}:\d{2}:\d{2}\s+(\d+)',
                    header_value)
                if res:
                    rfile['hash'] = hashlib.md5(
                        str(res.group(0)).encode('utf-8')).hexdigest()
                    rfile['day'] = res.group(3)
                    rfile['month'] = Utils.month_to_num(res.group(2))
                    rfile['year'] = res.group(4)
                    continue
    return (self.files_to_download, [])