def rebuild(self, db_path, log):
    """
    Download all the files inside a remote directory of the ftp
    server. If a file is already present in the local directory, it
    is rewritten. Files that contain the string '-NRT-' in their
    filename are NOT downloaded (same filter as harvest()).

    Args:
        - *db_path*: the path of the directory set in the download
          program.
        - *log*: a logger object from the class Log to print
          informations on the standard output

    Returns:
        - *downloaded*: a list of all the downloaded filenames.
    """
    # In the following list I will store the name of the
    # files that will be downloaded or updated
    downloaded = []

    # Check if the directory for this harvester is present
    # in the database (expected=False: rebuild creates it)
    path = join(db_path, relative_path)
    ensure_dir(path, log, expected=False)

    # Open the connection with the remote archive
    connection = FTP(ftp_url)
    connection.login(user=user, passwd=password)

    # Enter in the folder "Intermediate"
    connection.cwd('Intermediate')
    # Enter in "OCEANCOLOUR_MED_CHL_L4_NRT_OBSERVATIONS_009_060"
    connection.cwd('OCEANCOLOUR_MED_CHL_L4_NRT_OBSERVATIONS_009_060')
    # Enter in "dataset-oc-med-chl-modis_a-l4-chl_7km_daily-rt-v02"
    # NOTE(review): harvest() enters the "-v01" dataset instead of
    # "-v02" -- confirm which dataset version is the intended one.
    connection.cwd('dataset-oc-med-chl-modis_a-l4-chl_7km_daily-rt-v02')

    # One remote subdirectory per year; download every non-NRT file
    _, years, _ = list_files(connection)
    for year in years:
        connection.cwd(year)
        files, _, perms = list_files(connection)
        # BUGFIX: the docstring promised that NRT files are skipped,
        # but the original code downloaded every file. Apply the same
        # '-NRT-' filter used by harvest().
        files_to_be_downloaded = [f for f in files if '-NRT-' not in f]
        for f in files_to_be_downloaded:
            d = download_file(connection, f, path, log, perms, False)
            if d:
                downloaded.append(f)
        connection.cwd('..')

    connection.quit()
    return downloaded
def harvest(self, db_path, log):
    """
    Download all the files inside a remote directory of the ftp
    server whose modification date is after the modification date of
    the last file in the local dir. Files that contain '-NRT-' in
    their name are never downloaded.

    Args:
        - *db_path*: the path of the directory set in the download
          program.
        - *log*: a logger object from the class Log to print
          informations on the standard output

    Returns:
        - *downloaded*: a list of all the downloaded filenames.
    """
    # In the following list I will store the name of the
    # files that will be downloaded or updated
    downloaded = []

    # Check if the directory for this harvester is present
    # in the database (expected=True: harvest needs it to exist)
    path = join(db_path, relative_path)
    ensure_dir(path, log, expected=True)

    # Open the connection with the remote archive
    connection = FTP(ftp_url)
    connection.login(user=user, passwd=password)

    # Enter in the folder "Intermediate"
    connection.cwd('Intermediate')
    # Enter in "OCEANCOLOUR_MED_CHL_L4_NRT_OBSERVATIONS_009_060"
    connection.cwd('OCEANCOLOUR_MED_CHL_L4_NRT_OBSERVATIONS_009_060')
    # Enter in "dataset-oc-med-chl-modis_a-l4-chl_7km_daily-rt-v01"
    # NOTE(review): rebuild() enters the "-v02" dataset instead of
    # "-v01" -- confirm which dataset version is the intended one.
    connection.cwd('dataset-oc-med-chl-modis_a-l4-chl_7km_daily-rt-v01')

    # List all the local files, ignoring the marker left behind by an
    # interrupted download
    loc_files = [f for f in listdir(path)
                 if f != 'incomplete_download.tmp']

    # If there are no files, download everything
    if len(loc_files) == 0:
        log.info('No local files found! Everything will be '
                 'downloaded from the remote repository!')
        _, years, _ = list_files(connection)
        for year in years:
            connection.cwd(year)
            files, _, perms = list_files(connection)
            files_to_be_downloaded = [f for f in files
                                      if '-NRT-' not in f]
            for f in files_to_be_downloaded:
                d = download_file(connection, f, path, log, perms,
                                  False)
                if d:
                    downloaded.append(f)
            connection.cwd('..')
    else:
        # Filenames sort chronologically (they start with YYYY), so
        # the last one after sorting is the most recent local file
        loc_files.sort()
        last_file = loc_files[-1]
        last_year = int(last_file[0:4])
        _, years, _ = list_files(connection)
        new_years = [y for y in years if int(y) > last_year]

        # Enter in the folder with the year of the last downloaded
        # file and download every file which is newer than that
        connection.cwd(str(last_year))
        files, _, perms = list_files(connection)
        files_to_be_downloaded = [f for f in files
                                  if '-NRT-' not in f]
        for f in files_to_be_downloaded:
            if f > last_file:
                d = download_file(connection, f, path, log, perms,
                                  True, True)
                if d:
                    downloaded.append(f)
        connection.cwd('..')

        # Now we will download what is in the folders of the years
        # after the last file
        for year in new_years:
            connection.cwd(year)
            files, _, perms = list_files(connection)
            files_to_be_downloaded = [f for f in files
                                      if '-NRT-' not in f]
            for f in files_to_be_downloaded:
                d = download_file(connection, f, path, log, perms,
                                  True, True)
                if d:
                    downloaded.append(f)
            connection.cwd('..')

    # Warning if we found a lot of updates or no updates at all
    if len(downloaded) == 0:
        log.info('No updates found!')
    if len(downloaded) > 1:
        warn_message = 'Downloaded more than one file:'
        for f in downloaded:
            warn_message += '\n - ' + str(f)
        log.info(warn_message, split_lines=False)

    connection.quit()
    return downloaded
def harvest(self, db_path, log):
    """
    Download all the files inside the remote directories "vessel" and
    "mooring" of the remote ftp server whose modification date is
    after the modification date of the last file in the local dir.
    Please do not put any file in the local directory because this
    may change the date of the last edited file.

    Args:
        - *db_path*: the path of the directory set in the download
          program.
        - *log*: a logger object from the class Log to print
          informations on the standard output

    Returns:
        - *downloaded*: a list of all the downloaded filenames.
    """
    # In the following list I will store the name of the
    # files that will be downloaded or updated
    downloaded = []

    # Check if the directory for this harvester is present
    # in the database
    path = join(db_path, relative_path)
    ensure_dir(path, log, expected=True)
    # Check if exists the folder "vessel"
    path_vessel = join(path, "vessel")
    ensure_dir(path_vessel, log, expected=True)
    # Check if exists the folder "mooring"
    path_mooring = join(path, "mooring")
    ensure_dir(path_mooring, log, expected=True)

    # Open the connection with the remote archive
    connection = FTP(ftp_url)
    connection.login(user=user, passwd=password)

    # Enter in the folders
    connection.cwd('Core')
    connection.cwd('INSITU_MED_NRT_OBSERVATIONS_013_035')
    connection.cwd('monthly')

    # Now I will download everything from the vessel dir
    connection.cwd('vessel')
    log.debug("Entering in dir vessel")

    # Check the last file we have already downloaded.
    # BUGFIX: skip the 'incomplete_download.tmp' marker here as well
    # (the mooring branch already did); otherwise a leftover marker
    # makes int(l.split('_')[1]) raise.
    already_downloaded = listdir(path_vessel)
    file_dates = [int(l.split('_')[1]) for l in already_downloaded
                  if l != 'incomplete_download.tmp']
    if len(file_dates) == 0:
        last_downloaded = 0
    else:
        last_downloaded = max(file_dates)
    # file_dates entries look like YYYYMM integers, so //100 is the
    # year and %100 is the month
    log.debug("Last downloaded file on ??/{0:0>2}/{1:0>4}".format(
        last_downloaded % 100, last_downloaded // 100))

    # List all the dirs and take only the one that are generated
    # after the last file downloaded
    _, subdirs, _ = list_files(connection)
    subdirs_to_check = [d for d in subdirs
                        if int(d) >= last_downloaded]

    # Download all the file in that dirs
    for d in sorted(subdirs_to_check):
        log.debug("Entering in dir vessel/" + d)
        connection.cwd(d)
        files, _, perms = list_files(connection)
        for f in files:
            # Only netcdf files whose name starts with "MO"
            if f[:2] == "MO" and f[-3:] == ".nc":
                # Renamed from 'd' to avoid shadowing the directory
                # loop variable
                done = download_file(connection, f, path_vessel, log,
                                     perms, True, False)
                if done:
                    downloaded.append(f)
        connection.cwd('..')
    connection.cwd('..')

    # Now the same for the mooring dir
    connection.cwd('mooring')
    log.debug("Entering in dir mooring")
    already_downloaded = listdir(path_mooring)
    file_dates = [int(l.split('_')[1]) for l in already_downloaded
                  if l != 'incomplete_download.tmp']
    if len(file_dates) == 0:
        last_downloaded = 0
    else:
        last_downloaded = max(file_dates)
    log.debug("Last downloaded file on ??/{0:0>2}/{1:0>4}".format(
        last_downloaded % 100, last_downloaded // 100))

    _, subdirs, _ = list_files(connection)
    subdirs_to_check = [d for d in subdirs
                        if int(d) >= last_downloaded]
    for d in sorted(subdirs_to_check):
        log.debug("Entering in dir mooring/" + d)
        connection.cwd(d)
        files, _, perms = list_files(connection)
        for f in files:
            if f[:2] == "MO" and f[-3:] == ".nc":
                done = download_file(connection, f, path_mooring, log,
                                     perms, True, False)
                if done:
                    downloaded.append(f)
        connection.cwd('..')
    connection.cwd('..')

    # At the end, download the index (it lives one level above the
    # "monthly" folder)
    connection.cwd('..')
    _, _, perms = list_files(connection)
    download_file(connection, 'index_monthly.txt', path, log,
                  perms, False)

    connection.quit()
    return downloaded
def rebuild(self, db_path, log):
    """
    Download all the files inside the remote directories "vessel" and
    "mooring" of the remote ftp server. If a file already exists, it
    will be rewritten.

    Args:
        - *db_path*: the path of the directory set in the download
          program.
        - *log*: a logger object from the class Log to print
          informations on the standard output

    Returns:
        - *downloaded*: a list of all the downloaded filenames.
    """
    # In the following list I will store the name of the
    # files that will be downloaded or updated
    downloaded = []

    # Check if the directory for this harvester is present
    # in the database (expected=False: rebuild creates them)
    path = join(db_path, relative_path)
    ensure_dir(path, log, expected=False)
    # Check if exists the folder "vessel"
    path_vessel = join(path, "vessel")
    ensure_dir(path_vessel, log, expected=False)
    # Check if exists the folder "mooring"
    path_mooring = join(path, "mooring")
    ensure_dir(path_mooring, log, expected=False)

    # Open the connection with the remote archive
    connection = FTP(ftp_url)
    connection.login(user=user, passwd=password)
    connection.cwd('Core')
    connection.cwd('INSITU_MED_NRT_OBSERVATIONS_013_035')
    connection.cwd('monthly')

    # Enter in the folder "vessel"
    connection.cwd('vessel')
    log.debug("Entering in dir vessel")

    # For every subdir, download every netcdf file whose
    # name starts with "MO" and put it in the vessel dir
    _, subdirs, _ = list_files(connection)
    for d in sorted(subdirs):
        log.debug("Entering in dir vessel/" + d)
        connection.cwd(d)
        files, _, perms = list_files(connection)
        for f in files:
            if f[:2] == "MO" and f[-3:] == ".nc":
                # Renamed from 'd' to avoid shadowing the directory
                # loop variable
                done = download_file(connection, f, path_vessel, log,
                                     perms, False)
                if done:
                    downloaded.append(f)
        connection.cwd('..')
    connection.cwd('..')

    # The same for the other dir
    connection.cwd('mooring')
    log.debug("Entering in dir mooring")
    _, subdirs, _ = list_files(connection)
    for d in sorted(subdirs):
        log.debug("Entering in dir mooring/" + d)
        connection.cwd(d)
        files, _, perms = list_files(connection)
        for f in files:
            if f[:2] == "MO" and f[-3:] == ".nc":
                done = download_file(connection, f, path_mooring, log,
                                     perms, False)
                if done:
                    downloaded.append(f)
        connection.cwd('..')
    connection.cwd('..')

    # At the end, download the index (it lives one level above the
    # "monthly" folder)
    connection.cwd('..')
    _, _, perms = list_files(connection)
    download_file(connection, 'index_monthly.txt', path, log,
                  perms, False)

    connection.quit()
    return downloaded