def test_copy(self):
    from_dir = os.path.dirname(os.path.realpath(__file__))
    local_file = 'biomaj_tests.py'
    files_to_copy = [{'root': from_dir, 'name': local_file}]
    to_dir = self.utils.data_dir
    Utils.copy_files(files_to_copy, to_dir)
    self.assertTrue(os.path.exists(to_dir + '/biomaj_tests.py'))
def test_uncompress(self):
    from_file = {"root": os.path.dirname(os.path.realpath(__file__)),
                 "name": "bank/test.fasta.gz"}
    to_dir = self.utils.data_dir
    Utils.copy_files([from_file], to_dir)
    Utils.uncompress(os.path.join(to_dir, from_file["name"]))
    self.assertTrue(os.path.exists(to_dir + "/bank/test.fasta"))
def wf_uncompress(self):
    '''
    Uncompress files if archives and no.extract = false
    '''
    logging.info('Workflow:wf_uncompress')
    no_extract = self.session.config.get('no.extract')
    if no_extract is None or no_extract == 'false':
        for file in self.downloaded_files:
            if 'save_as' not in file:
                file['save_as'] = file['name']
            Utils.uncompress(self.session.get_offline_directory() + '/' + file['save_as'])
    return True
def download(self, local_dir):
    '''
    Copy local files to local_dir

    :param local_dir: Directory where files should be copied
    :type local_dir: str
    :return: list of downloaded files
    '''
    logging.debug('Local:Download')
    Utils.copy_files(self.files_to_download, local_dir, lock=self.mkdir_lock)
    return self.files_to_download
def test_get_more_recent_file(self):
    files = [
        {'name': '/test1', 'year': '2013', 'month': '11', 'day': '10', 'size': 10},
        {'name': '/test2', 'year': '2013', 'month': '11', 'day': '12', 'size': 10},
        {'name': '/test/test1', 'year': '1988', 'month': '11', 'day': '10', 'size': 10},
        {'name': '/test/test11', 'year': '2013', 'month': '9', 'day': '23', 'size': 10}
    ]
    release = Utils.get_more_recent_file(files)
    self.assertTrue(release['year'] == '2013')
    self.assertTrue(release['month'] == '11')
    self.assertTrue(release['day'] == '12')
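The test above only pins down the contract of Utils.get_more_recent_file: it returns the entry with the latest (year, month, day). A minimal sketch of that selection rule, not the library implementation, using the same files list:

# Sketch of the selection the test asserts (not the Utils implementation):
# pick the entry with the greatest (year, month, day), compared numerically.
release = max(files, key=lambda f: (int(f['year']), int(f['month']), int(f['day'])))
# With the list above, release is the '/test2' entry (2013-11-12).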
def wf_stats(self):
    '''
    Get some stats from current release data dir
    '''
    logging.info('Workflow:wf_stats')
    do_stats = self.bank.config.get('data.stats')
    if do_stats is None or do_stats == '0':
        self.session.set('fullsize', 0)
        return True
    prod_dir = self.session.get_full_release_directory()
    dir_size = Utils.get_folder_size(prod_dir)
    self.session.set('fullsize', dir_size)
    return True
def wf_copy(self):
    '''
    Copy files from offline directory to release directory
    '''
    logging.info('Workflow:wf_copy')
    from_dir = os.path.join(self.session.config.get('data.dir'),
                            self.session.config.get('offline.dir.name'))
    regexp = self.session.config.get('local.files').split()
    to_dir = os.path.join(self.session.config.get('data.dir'),
                          self.session.config.get('dir.version'),
                          self.session.get_release_directory(),
                          'flat')
    local_files = Utils.copy_files_with_regexp(from_dir, to_dir, regexp, True)
    self.session._session['files'] = local_files
    if len(self.session._session['files']) == 0:
        logging.error('Workflow:wf_copy:No file match in offline dir')
        return False
    return True
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-s', '--scan', dest="directory", help="Directory to scan")
    parser.add_argument('--type', dest="ftype", help="Files type")
    parser.add_argument('--tags', dest="tags", action="append", default=[],
                        help="tags, format key:value, can be repeated multiple times")
    args = parser.parse_args()

    if not os.path.exists(args.directory):
        sys.exit(1)

    res = {}
    for (path, dirs, files) in os.walk(args.directory):
        for file in files:
            filename = os.path.join(path, file)
            (file_format, mime) = Utils.detect_format(filename)
            if file_format is not None:
                file_format = file_format.replace('application/', '')
            filename = filename.replace(args.directory + '/', '')
            if file_format is not None:
                if file_format not in res:
                    res[file_format] = [filename]
                else:
                    res[file_format].append(filename)

    f_type = ''
    if args.ftype:
        f_type = args.ftype
    tags = ''
    if args.tags:
        tags = ','.join(args.tags)
    for fformat in res.keys():
        # print() works on both Python 2 and 3; the original print statement was Python 2 only
        print('##BIOMAJ#' + fformat + '#' + f_type + '#' + tags + '#' + ','.join(res[fformat]))
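The scan tool above emits one metadata line per detected format. A minimal sketch of that output, assuming a hypothetical run where one FASTA file was found and the tool was called with --type nucleic and --tags organism:hsapiens:

# Hypothetical values illustrating the '##BIOMAJ#' line built in main() above
fformat = 'fasta'                    # detected format, 'application/' prefix stripped
f_type = 'nucleic'                   # value of --type
tags = 'organism:hsapiens'           # comma-joined --tags values
files = ['bank/test.fasta.gz']       # files detected with that format
print('##BIOMAJ#' + fformat + '#' + f_type + '#' + tags + '#' + ','.join(files))
# -> ##BIOMAJ#fasta#nucleic#organism:hsapiens#bank/test.fasta.gz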
def wf_uncompress(self):
    """
    Uncompress files if archives and no.extract = false
    """
    logging.info("Workflow:wf_uncompress")
    no_extract = self.session.config.get("no.extract")
    if no_extract is None or no_extract == "false":
        for file in self.downloaded_files:
            if "save_as" not in file:
                file["save_as"] = file["name"]
            nb_try = 1
            not_ok = True
            while nb_try < 3 and not_ok:
                status = Utils.uncompress(self.session.get_offline_directory() + "/" + file["save_as"])
                if status:
                    not_ok = False
                else:
                    logging.warn("Workflow:wf_uncompress:Failure:" + file["name"] + ":" + str(nb_try))
                    nb_try += 1
            if not_ok:
                logging.error("Workflow:wf_uncompress:Failure:" + file["name"])
                return False
    return True
def test_mimes(self):
    fasta_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'bank/test2.fasta')
    (mime, encoding) = Utils.detect_format(fasta_file)
    self.assertTrue('application/fasta' == mime)
def list(self, directory=''):
    '''
    List FTP directory

    :return: tuple of file and dirs in current directory with details
    '''
    logging.debug('Download:List:' + self.url + self.rootdir + directory)
    #self.crl.setopt(pycurl.URL, self.url+self.rootdir+directory)
    try:
        self.crl.setopt(pycurl.URL, self.url + self.rootdir + directory)
    except Exception as a:
        self.crl.setopt(pycurl.URL, (self.url + self.rootdir + directory).encode('ascii', 'ignore'))

    if self.proxy is not None:
        self.crl.setopt(pycurl.PROXY, self.proxy)
        if self.proxy_auth is not None:
            self.crl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)

    if self.credentials is not None:
        self.crl.setopt(pycurl.USERPWD, self.credentials)

    output = BytesIO()
    # lets assign this buffer to pycurl object
    self.crl.setopt(pycurl.WRITEFUNCTION, output.write)
    self.crl.setopt(pycurl.HEADERFUNCTION, self.header_function)
    self.crl.setopt(pycurl.CONNECTTIMEOUT, 300)
    # Download should not take more than 5 minutes
    self.crl.setopt(pycurl.TIMEOUT, self.timeout)
    self.crl.setopt(pycurl.NOSIGNAL, 1)
    try:
        self.crl.perform()
    except Exception as e:
        logging.error('Could not get errcode:' + str(e))

    # Figure out what encoding was sent with the response, if any.
    # Check against lowercased header name.
    encoding = None
    if 'content-type' in self.headers:
        content_type = self.headers['content-type'].lower()
        match = re.search('charset=(\S+)', content_type)
        if match:
            encoding = match.group(1)
    if encoding is None:
        # Default encoding for HTML is iso-8859-1.
        # Other content types may have different default encoding,
        # or in case of binary data, may have no encoding at all.
        encoding = 'iso-8859-1'

    # lets get the output in a string
    result = output.getvalue().decode(encoding)
    # FTP LIST output is separated by \r\n
    # lets split the output in lines
    #lines = result.split(r'[\r\n]+')
    lines = re.split(r'[\n\r]+', result)
    # lets walk through each line
    rfiles = []
    rdirs = []

    for line in lines:
        rfile = {}
        # the individual fields in this list of parts
        parts = line.split()
        if not parts:
            continue
        rfile['permissions'] = parts[0]
        rfile['group'] = parts[2]
        rfile['user'] = parts[3]
        rfile['size'] = parts[4]
        rfile['month'] = Utils.month_to_num(parts[5])
        rfile['day'] = parts[6]
        rfile['hash'] = hashlib.md5(line.encode('utf-8')).hexdigest()
        try:
            rfile['year'] = int(parts[7])
        except Exception as e:
            # specific ftp case issues at getting date info
            curdate = datetime.now()
            rfile['year'] = curdate.year
            # Year not given, month greater than current means previous year
            if rfile['month'] > curdate.month:
                rfile['year'] = curdate.year - 1
            # Same month but later day => previous year
            if rfile['month'] == curdate.month and int(rfile['day']) > curdate.day:
                rfile['year'] = curdate.year - 1
        rfile['name'] = parts[8]
        if len(parts) >= 10 and parts[9] == '->':
            # Symlink, add to files AND dirs as we don't know the type of the link
            rdirs.append(rfile)

        is_dir = False
        if re.match('^d', rfile['permissions']):
            is_dir = True

        if not is_dir:
            rfiles.append(rfile)
        else:
            rdirs.append(rfile)
    return (rfiles, rdirs)
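To make the positional parsing in list() above concrete, here is a minimal standalone sketch of the field mapping for one UNIX-style FTP LIST line; the sample line and the value 7 for Utils.month_to_num('Jul') are assumptions for illustration:

line = '-rw-r--r--   1 ftp      ftp      133 Jul 19 2014 test.fasta.gz'  # hypothetical LIST entry
parts = line.split()
rfile = {
    'permissions': parts[0],   # '-rw-r--r--'
    'group': parts[2],         # 'ftp'
    'user': parts[3],          # 'ftp'
    'size': parts[4],          # '133'
    'month': 7,                # Utils.month_to_num(parts[5]) for 'Jul'
    'day': parts[6],           # '19'
    'year': int(parts[7]),     # 2014; a time such as '13:02' here triggers the fallback branch
    'name': parts[8],          # 'test.fasta.gz'
}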
def test_copy_with_regexp(self):
    from_dir = os.path.dirname(os.path.realpath(__file__))
    to_dir = self.utils.data_dir
    Utils.copy_files_with_regexp(from_dir, to_dir, [r'.*\.py'])
    self.assertTrue(os.path.exists(to_dir + '/biomaj_tests.py'))
def wf_download(self):
    """
    Download remote files or use an available local copy from last production directory if possible.
    """
    logging.info("Workflow:wf_download")
    flow = self.get_flow(Workflow.FLOW_DOWNLOAD)
    downloader = None
    cf = self.session.config
    self.session.previous_release = self.session.get("previous_release")

    if cf.get("protocol") == "multi":
        """
        Search for:
        protocol = multi
        remote.file.0.protocol = directftp
        remote.file.0.server = ftp.ncbi.org
        remote.file.0.path = /musmusculus/chr1/chr1.fa

        => http://ftp2.fr.debian.org/debian/README.html?key1=value&key2=value2
        remote.file.1.protocol = directhttp
        remote.file.1.server = ftp2.fr.debian.org
        remote.file.1.path = debian/README.html
        remote.file.1.method = GET
        remote.file.1.params.keys = key1,key2
        remote.file.1.params.key1 = value1
        remote.file.1.params.key2 = value2

        => http://ftp2.fr.debian.org/debian/README.html
        #POST PARAMS:
        key1=value
        key2=value2
        remote.file.1.protocol = directhttp
        remote.file.1.server = ftp2.fr.debian.org
        remote.file.1.path = debian/README.html
        remote.file.1.method = POST
        remote.file.1.params.keys = key1,key2
        remote.file.1.params.key1 = value1
        remote.file.1.params.key2 = value2
        ......
        """
        downloader = MultiDownload()
        downloaders = []
        # Creates multiple downloaders
        i = 0
        rfile = cf.get("remote.file." + str(i) + ".path")
        while rfile is not None:
            if cf.get("remote.file." + str(i) + ".protocol") is not None:
                protocol = cf.get("remote.file." + str(i) + ".protocol")
            else:
                protocol = cf.get("protocol")
            if cf.get("remote.file." + str(i) + ".server") is not None:
                server = cf.get("remote.file." + str(i) + ".server")
            else:
                server = cf.get("server")
            subdownloader = self.get_handler(protocol, server, "", [cf.get("remote.file." + str(i) + ".path")])
            if cf.get("remote.file." + str(i) + ".credentials") is not None:
                credentials = cf.get("remote.file." + str(i) + ".credentials")
            else:
                credentials = cf.get("server.credentials")
            if credentials is not None:
                subdownloader.set_credentials(credentials)
            if protocol == "directhttp":
                subdownloader.method = cf.get("remote.file." + str(i) + ".method")
                if subdownloader.method is None:
                    subdownloader.method = "GET"
                if cf.get("remote.file." + str(i) + ".name"):
                    subdownloader.save_as = cf.get("remote.file." + str(i) + ".name")
                else:
                    subdownloader.save_as = cf.get("remote.file." + str(i) + ".path")
                if cf.get("remote.file." + str(i) + ".method"):
                    subdownloader.method = cf.get("remote.file." + str(i) + ".method").strip().upper()
                subdownloader.params = {}
                keys = cf.get("remote.file." + str(i) + ".params.keys")
                if keys is not None:
                    keys = keys.split(",")
                    for key in keys:
                        param = cf.get("remote.file." + str(i) + ".params." + key.strip())
                        subdownloader.param[key.strip()] = param.strip()
            downloaders.append(subdownloader)
            i += 1
            rfile = cf.get("remote.file." + str(i) + ".path")
        downloader.add_downloaders(downloaders)
    else:
        """
        Simple case, one downloader with regexp
        """
        protocol = cf.get("protocol")
        if protocol == "directhttp" or protocol == "directftp":
            downloader = self.get_handler(cf.get("protocol"), cf.get("server"), "/", [cf.get("remote.dir")[:-1]])
            downloader.method = cf.get("url.method")
            if downloader.method is None:
                downloader.method = "GET"
            downloader.save_as = cf.get("target.name")
            keys = cf.get("url.params")
            if keys is not None:
                keys = keys.split(",")
                for key in keys:
                    param = cf.get(key.strip() + ".value")
                    downloader.param[key.strip()] = param.strip()
        else:
            downloader = self.get_handler(cf.get("protocol"), cf.get("server"), cf.get("remote.dir"))

    if downloader is None:
        logging.error("Protocol " + cf.get("protocol") + " not supported")
        return False

    (file_list, dir_list) = downloader.list()

    downloader.match(cf.get("remote.files").split(), file_list, dir_list)
    for f in downloader.files_to_download:
        f["save_as"] = f["name"]
        for p in cf.get("remote.files").split():
            res = re.match("/" + p, f["name"])
            if res is not None and res.groups() is not None and len(res.groups()) >= 1:
                f["save_as"] = "/".join(res.groups())
                break

    self.session.set("download_files", downloader.files_to_download)

    if self.session.get("release") is None:
        # Not defined, or could not get it earlier
        # Set release to most recent file to download
        release_dict = Utils.get_more_recent_file(downloader.files_to_download)
        if release_dict is None:
            today = datetime.datetime.now()
            release_dict = {"year": today.year, "month": today.month, "day": today.day}
        release = str(release_dict["year"]) + "-" + str(release_dict["month"]) + "-" + str(release_dict["day"])
        self.session.set("release", release)
        self.session.set("remoterelease", release)
        # We restart from scratch, check if directory with this release already exists
        if self.options.get_option(Options.FROMSCRATCH):
            index = 0
            # Release directory exists, set index to 1
            if os.path.exists(self.session.get_full_release_directory()):
                index = 1
            for x in range(1, 100):
                if os.path.exists(self.session.get_full_release_directory() + "__" + str(x)):
                    index = x + 1
            # while os.path.exists(self.session.get_full_release_directory()+'__'+str(index)):
            #     index += 1
            # If we found a directory for this release: XX or XX__Y
            if index > 0:
                self.session.set("release", release + "__" + str(index))
                release = release + "__" + str(index)

    logging.info("Workflow:wf_download:release:remoterelease:" + self.session.get("remoterelease"))
    logging.info("Workflow:wf_download:release:release:" + release)
    MongoConnector.banks.update({"name": self.bank.name},
                                {"$set": {"status.release.progress": str(release)}})

    self.download_go_ahead = False
    if self.options.get_option(Options.FROM_TASK) == "download":
        # We want to download again in same release, that's fine, we do not care it is the same release
        self.download_go_ahead = True

    if not self.download_go_ahead and self.session.previous_release == self.session.get("remoterelease"):
        logging.info("Workflow:wf_release:same_as_previous_session")
        return self.no_need_to_update()

    self.banks = MongoConnector.banks
    self.bank.bank = self.banks.find_one({"name": self.name})

    nb_prod_dir = len(self.bank.bank["production"])
    offline_dir = self.session.get_offline_directory()

    copied_files = []

    # Check if already in offline dir
    keep_files = []
    if os.path.exists(offline_dir):
        for file_to_download in downloader.files_to_download:
            # If file is in offline dir and has same date and size, do not download again
            if os.path.exists(offline_dir + "/" + file_to_download["name"]):
                try:
                    file_stat = os.stat(offline_dir + "/" + file_to_download["name"])
                    f_stat = datetime.datetime.fromtimestamp(os.path.getmtime(offline_dir + "/" + file_to_download["name"]))
                    year = str(f_stat.year)
                    month = str(f_stat.month)
                    day = str(f_stat.day)
                    if (str(file_stat.st_size) != str(file_to_download["size"])
                            or str(year) != str(file_to_download["year"])
                            or str(month) != str(file_to_download["month"])
                            or str(day) != str(file_to_download["day"])):
                        logging.debug("Workflow:wf_download:different_from_offline:" + file_to_download["name"])
                        keep_files.append(file_to_download)
                    else:
                        logging.debug("Workflow:wf_download:offline:" + file_to_download["name"])
                except Exception as e:
                    # Could not get stats on file
                    os.remove(offline_dir + "/" + file_to_download["name"])
                    keep_files.append(file_to_download)
            else:
                keep_files.append(file_to_download)
        downloader.files_to_download = keep_files

    self.download_go_ahead = False
    if self.options.get_option(Options.FROM_TASK) == "download":
        # We want to download again in same release, that's fine, we do not care it is the same release
        self.download_go_ahead = True

    if not self.options.get_option(Options.FROMSCRATCH) and not self.download_go_ahead and nb_prod_dir > 0:
        # for prod in self.bank.bank['production']:
        #     if self.session.get('release') == prod['release']:
        #         logging.info('Workflow:wf_release:same_as_previous_production_dir')
        #         return self.no_need_to_update()
        # Get last production
        last_production = self.bank.bank["production"][nb_prod_dir - 1]
        # Get session corresponding to production directory
        last_production_session = self.banks.find_one({"name": self.name, "sessions.id": last_production["session"]},
                                                      {"sessions.$": 1})
        last_production_dir = os.path.join(last_production["data_dir"], cf.get("dir.version"), last_production["release"])
        # Checks if some files can be copied instead of downloaded
        downloader.download_or_copy(last_production_session["sessions"][0]["files"], last_production_dir)
        if len(downloader.files_to_download) == 0:
            return self.no_need_to_update()

        # release_dir = os.path.join(self.session.config.get('data.dir'),
        #                            self.session.config.get('dir.version'),
        #                            self.session.get_release_directory())
        logging.debug("Workflow:wf_download:Copy files from " + last_production_dir)
        copied_files = downloader.files_to_copy
        Utils.copy_files(downloader.files_to_copy, offline_dir)

    downloader.close()

    DownloadThread.NB_THREAD = int(self.session.config.get("files.num.threads"))

    if cf.get("protocol") == "multi":
        thlist = DownloadThread.get_threads_multi(downloader.downloaders, offline_dir)
    else:
        thlist = DownloadThread.get_threads(downloader, offline_dir)

    running_th = []
    for th in thlist:
        running_th.append(th)
        th.start()

    while len(running_th) > 0:
        try:
            # Join all threads using a timeout so it doesn't block
            # Filter out threads which have been joined or are None
            running_th = [t.join(1000) for t in running_th if t is not None and t.isAlive()]
            logging.debug("Workflow:wf_download:Download:Threads:" + str(running_th))
        except KeyboardInterrupt:
            logging.warn("Ctrl-c received! Sending kill to threads...")
            logging.warn("Running tasks will continue and process will stop.")
            for t in running_th:
                t.downloader.kill_received = True
    logging.info("Workflow:wf_download:Download:Threads:Over")
    # for th in thlist:
    #     th.join()
    is_error = False
    for th in thlist:
        if th.error:
            is_error = True
            downloader.error = True
            break

    self.downloaded_files = downloader.files_to_download + copied_files
    # self.downloaded_files = downloader.download(offline_dir) + copied_files
    # downloader.close()

    if downloader.error:
        logging.error("An error occurred during download")
        return False

    return True
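One non-obvious step in wf_download above is how save_as is rewritten from the capture groups of the remote.files patterns. A small illustration, with a hypothetical pattern and remote file name:

import re

p = r'pub/release-(\d+)/(chr\d+\.fa)'   # hypothetical entry from remote.files
name = '/pub/release-77/chr1.fa'        # hypothetical matched remote file name
res = re.match('/' + p, name)
if res is not None and res.groups():
    save_as = '/'.join(res.groups())    # '77/chr1.fa': local path keeps only the captured parts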
def list(self, directory=''):
    '''
    List HTTP directory

    :return: tuple of file and dirs in current directory with details
    '''
    logging.debug('Download:List:' + self.url + self.rootdir + directory)
    #self.crl.setopt(pycurl.URL, self.url+self.rootdir+directory)
    try:
        self.crl.setopt(pycurl.URL, self.url + self.rootdir + directory)
    except Exception as a:
        self.crl.setopt(pycurl.URL, (self.url + self.rootdir + directory).encode('ascii', 'ignore'))

    if self.proxy is not None:
        self.crl.setopt(pycurl.PROXY, self.proxy)
        if self.proxy_auth is not None:
            self.crl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)

    if self.credentials is not None:
        self.crl.setopt(pycurl.USERPWD, self.credentials)

    output = BytesIO()
    # lets assign this buffer to pycurl object
    self.crl.setopt(pycurl.WRITEFUNCTION, output.write)
    self.crl.setopt(pycurl.HEADERFUNCTION, self.header_function)
    self.crl.perform()

    # Figure out what encoding was sent with the response, if any.
    # Check against lowercased header name.
    encoding = None
    if 'content-type' in self.headers:
        content_type = self.headers['content-type'].lower()
        match = re.search('charset=(\S+)', content_type)
        if match:
            encoding = match.group(1)
    if encoding is None:
        # Default encoding for HTML is iso-8859-1.
        # Other content types may have different default encoding,
        # or in case of binary data, may have no encoding at all.
        encoding = 'iso-8859-1'

    # lets get the output in a string
    result = output.getvalue().decode(encoding)
    '''
    'http.parse.dir.line': r'<a[\s]+href="([\S]+)/".*alt="\[DIR\]">.*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})',
    'http.parse.file.line': r'<a[\s]+href="([\S]+)".*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})[\s]+([\d\.]+[MKG]{0,1})',
    'http.group.dir.name': 1,
    'http.group.dir.date': 2,
    'http.group.file.name': 1,
    'http.group.file.date': 2,
    'http.group.file.size': 3,
    '''
    rfiles = []
    rdirs = []

    dirs = re.findall(self.config.get('http.parse.dir.line'), result)
    if dirs is not None and len(dirs) > 0:
        for founddir in dirs:
            rfile = {}
            rfile['permissions'] = ''
            rfile['group'] = ''
            rfile['user'] = ''
            rfile['size'] = '0'
            date = founddir[int(self.config.get('http.group.dir.date')) - 1]
            dirdate = date.split()
            parts = dirdate[0].split('-')
            # 19-Jul-2014 13:02
            rfile['month'] = Utils.month_to_num(parts[1])
            rfile['day'] = parts[0]
            rfile['year'] = parts[2]
            rfile['name'] = founddir[int(self.config.get('http.group.dir.name')) - 1]
            rdirs.append(rfile)

    files = re.findall(self.config.get('http.parse.file.line'), result)
    if files is not None and len(files) > 0:
        for foundfile in files:
            rfile = {}
            rfile['permissions'] = ''
            rfile['group'] = ''
            rfile['user'] = ''
            rfile['size'] = foundfile[int(self.config.get('http.group.file.size')) - 1]
            date = foundfile[int(self.config.get('http.group.file.date')) - 1]
            if self.config.get('http.parse.file.date.format'):
                date_object = datetime.datetime.strptime(date,
                                                         self.config.get('http.parse.file.date.format').replace('%%', '%'))
                rfile['month'] = date_object.month
                rfile['day'] = date_object.day
                rfile['year'] = date_object.year
            else:
                dirdate = date.split()
                parts = dirdate[0].split('-')
                # 19-Jul-2014 13:02
                rfile['month'] = Utils.month_to_num(parts[1])
                rfile['day'] = parts[0]
                rfile['year'] = parts[2]
            rfile['name'] = foundfile[int(self.config.get('http.group.file.name')) - 1]
            filehash = (rfile['name'] + str(date) + str(rfile['size'])).encode('utf-8')
            rfile['hash'] = hashlib.md5(filehash).hexdigest()
            rfiles.append(rfile)
    return (rfiles, rdirs)
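As a worked example of the default http.parse.file.line pattern quoted in the docstring above, applied to a hypothetical Apache-style index line (name, date and size are the three capture groups):

import re

pattern = r'<a[\s]+href="([\S]+)".*([\d]{2}-[\w\d]{2,5}-[\d]{4}\s[\d]{2}:[\d]{2})[\s]+([\d\.]+[MKG]{0,1})'
line = '<a href="test.fasta.gz">test.fasta.gz</a>      19-Jul-2014 13:02  133K'  # hypothetical index line
for name, date, size in re.findall(pattern, line):
    print(name, date, size)  # test.fasta.gz 19-Jul-2014 13:02 133K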
def list(self, directory=''):
    '''
    Try to get file headers to get last_modification and size
    '''
    for file in self.files_to_download:
        self.crl.setopt(pycurl.HEADER, True)
        if self.credentials is not None:
            self.crl.setopt(pycurl.USERPWD, self.credentials)

        if self.proxy is not None:
            self.crl.setopt(pycurl.PROXY, self.proxy)
            if self.proxy_auth is not None:
                self.crl.setopt(pycurl.PROXYUSERPWD, self.proxy_auth)

        self.crl.setopt(pycurl.NOBODY, True)
        try:
            self.crl.setopt(pycurl.URL, self.url + self.rootdir + file['name'])
        except Exception as a:
            self.crl.setopt(pycurl.URL, (self.url + self.rootdir + file['name']).encode('ascii', 'ignore'))
        #self.crl.setopt(pycurl.URL, self.url+self.rootdir+file['name'])

        output = BytesIO()
        # lets assign this buffer to pycurl object
        self.crl.setopt(pycurl.WRITEFUNCTION, output.write)
        self.crl.setopt(pycurl.HEADERFUNCTION, self.header_function)
        self.crl.perform()

        # Figure out what encoding was sent with the response, if any.
        # Check against lowercased header name.
        encoding = None
        if 'content-type' in self.headers:
            content_type = self.headers['content-type'].lower()
            match = re.search('charset=(\S+)', content_type)
            if match:
                encoding = match.group(1)
        if encoding is None:
            # Default encoding for HTML is iso-8859-1.
            # Other content types may have different default encoding,
            # or in case of binary data, may have no encoding at all.
            encoding = 'iso-8859-1'

        # lets get the output in a string
        result = output.getvalue().decode(encoding)
        lines = re.split(r'[\n\r]+', result)
        for line in lines:
            parts = line.split(':')
            if parts[0].strip() == 'Content-Length':
                file['size'] = parts[1].strip()
            if parts[0].strip() == 'Last-Modified':
                # Sun, 06 Nov 1994
                res = re.match('(\w+),\s+(\d+)\s+(\w+)\s+(\d+)', parts[1].strip())
                if res:
                    file['hash'] = hashlib.md5(str(res.group(0)).encode('utf-8')).hexdigest()
                    file['day'] = res.group(2)
                    file['month'] = Utils.month_to_num(res.group(3))
                    file['year'] = res.group(4)
                    continue
                # Sunday, 06-Nov-94
                res = re.match('(\w+),\s+(\d+)-(\w+)-(\d+)', parts[1].strip())
                if res:
                    file['hash'] = hashlib.md5(str(res.group(0)).encode('utf-8')).hexdigest()
                    file['day'] = res.group(2)
                    file['month'] = Utils.month_to_num(res.group(3))
                    file['year'] = str(2000 + int(res.group(4)))
                    continue
                # Sun Nov 6 08:49:37 1994
                res = re.match('(\w+)\s+(\w+)\s+(\d+)\s+\d{2}:\d{2}:\d{2}\s+(\d+)', parts[1].strip())
                if res:
                    file['hash'] = hashlib.md5(str(res.group(0)).encode('utf-8')).hexdigest()
                    file['day'] = res.group(3)
                    file['month'] = Utils.month_to_num(res.group(2))
                    file['year'] = res.group(4)
                    continue
    return (self.files_to_download, [])