def rebuild(self, db_path, log): """ Download all the files inside a remote directory of the ftp server. If the file is already present on the local directory, rewrite it. Do not download the file that contain the string '-NRT' in their filename. Args: - *db_path*: the path of the directory set in the download program. - *log*: a logger object from the class Log to print informations on the standard output Returns: - *downloaded*: a list of all the downloaded filenames. """ # In the following list I will store the name of the # files that will be downloaded or updated downloaded = [] # Check if the directory for this harvester is present # in the database path = join(db_path,relative_path) ensure_dir(path, log, expected=False) # Open the connection with the remote archive connection = FTP(ftp_url) connection.login(user=user, passwd=password) # Enter in the folder "Intermediate" connection.cwd('Intermediate') # Enter in "OCEANCOLOUR_MED_CHL_L4_NRT_OBSERVATIONS_009_060" connection.cwd('OCEANCOLOUR_MED_CHL_L4_NRT_OBSERVATIONS_009_060') # Enter in "dataset-oc-med-chl-modis_a-l4-chl_7km_daily-rt-v02" connection.cwd('dataset-oc-med-chl-modis_a-l4-chl_7km_daily-rt-v02') _, years, _ = list_files(connection) for year in years: connection.cwd(year) files, _, perms = list_files(connection) for f in files: d = download_file(connection, f, path, log, perms, False) if d: downloaded.append(f) connection.cwd('..') connection.quit() return downloaded
def rebuild(self, db_path, log, skip_if_present=False):
    """
    For every float in the wmo file, download every data file related
    to that float whose name starts with 'MR'. Then create an XML file
    with the data read from the wmo file.

    Args:
        - *db_path*: the path of the directory set in the download
          program.
        - *log*: a logger object from the class Log to print information
          on the standard output.
        - *skip_if_present*: a boolean value that, if True, avoids
          downloading again the files that are already saved in the
          local directory. By default it is False.

    Returns:
        - *downloaded*: a list of all the downloaded filenames.
    """
    # In the following list I will store the names of the
    # files that will be downloaded or updated
    print("REBUILD")
    downloaded = []

    # Read the wmo file line by line (excluding the first one,
    # which does not contain data)
    A = self.wmo_file_reader()

    # Delete, if present, the XML file with all the floats
    xml_file = join(xml_path, self.__class__.__name__ + '.xml')
    if exists(xml_file):
        remove(xml_file)
    # and create a new one (in memory)
    root = xml_tree.Element("BioFloats")
    root.set('Updated', now_as_string())
    tree = xml_tree.ElementTree(root)

    # Check if the directory for this harvester is present
    # in the database
    path = join(db_path, relative_path)
    ensure_dir(path, log, expected=False)

    # Download data for every float
    for l in range(len(A)):
        f = A[l]['wmo']
        floatname = A[l]['nome_fs'].replace(' ', '')
        if not self.is_a_lov_float(f, floatname):
            continue

        # Update the xml with the current status of the float
        f_in_xml = root.findall('wmo_' + str(f))
        if len(f_in_xml) == 0:
            f_node = xml_tree.SubElement(root, 'wmo_' + str(f))
        else:
            f_node = [fn for fn in f_in_xml if fn.tag == 'wmo_' + str(f)][0]
        f_node.set('status', A[l]['status'])

        try:
            urlfilelist = http_url + floatname + "/liste_all"
            print(urlfilelist)
            response = urllib2.urlopen(urlfilelist)
        except:
            log.info('Cannot download file ' + urlfilelist +
                     '. This file will be skipped!')
            continue
        remotepathlist = response.read().rsplit("\n")[:-1]
        filelist = [os.path.basename(fn) for fn in remotepathlist]

        # Now I look for the profiles dir. This is the folder
        # where all the data are stored
        if len(filelist) > 0:
            download_for_f = []
            # Copy every file into a local dir with the same name,
            # skipping the ones that we already have
            float_local_dir = join(path, f)
            print(float_local_dir)
            ensure_dir(float_local_dir, log, expected=False)
            for ff in filelist:
                url = http_url + floatname + "/" + ff
                d = download_file(url, ff, float_local_dir, log,
                                  None, True)
                # If the file was downloaded without any problem,
                # add it to the list of downloaded files
                if d:
                    downloaded.append(ff)
                    download_for_f.append(ff)
            if len(download_for_f) == 0:
                log.info('No updates found for float ' + str(f))
        else:
            log.info('No updates found for float ' + str(f))

    # Save the XML file
    xml_as_string = xml_tree.tostring(root)
    xml_rebuild = parseString(xml_as_string)
    pretty_xml = xml_rebuild.toprettyxml(indent=' ')
    pretty_xml_lines = pretty_xml.split('\n')
    pretty_xml = "\n".join([l for l in pretty_xml_lines if l.strip()])
    ensure_dir(xml_path, log, expected=False)
    with open(xml_file, 'w') as xml_f:
        xml_f.write(pretty_xml)

    # Return the list of downloaded files
    return downloaded
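# A small, standalone sketch of the XML bookkeeping used by rebuild() above:
# build the "BioFloats" tree with ElementTree, serialize it, pretty-print it
# through minidom and drop the blank lines that toprettyxml inserts. The wmo
# id and the timestamp below are invented placeholders (the real values come
# from the wmo file and from now_as_string()).
import xml.etree.ElementTree as xml_tree
from xml.dom.minidom import parseString


def _pretty_biofloats_sketch():
    root = xml_tree.Element("BioFloats")
    root.set('Updated', '2017-01-01 00:00:00')
    f_node = xml_tree.SubElement(root, 'wmo_6901764')
    f_node.set('status', 'A')
    xml_as_string = xml_tree.tostring(root)
    pretty = parseString(xml_as_string).toprettyxml(indent=' ')
    # toprettyxml pads the output with empty lines; keep only the real ones
    return "\n".join(l for l in pretty.split('\n') if l.strip())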
def harvest(self, db_path, log): """ For every float in the file wmo, check the status in the wmo_file and in the xml one. If in at least one file the float is reported as active, then check the last file downloaded for that wmo and download every file on the server that is more recent than the one already downloaded. Then update the xml file with the status reported in the wmo file. Args: - *db_path*: the path of the directory set in the download program. - *log*: a logger object from the class Log to print informations on the standard output Returns: - *downloaded*: a list of all the downloaded filenames. """ # In the following list I will store the name of the # files that will be downloaded or updated print("HARVEST") downloaded = [] # Read the wmo file A = self.wmo_file_reader() lines_active_floats=np.where(A['status']=='A')[0] lines_dead__floats =np.where(A['status']=='D')[0] # Now we need the xml file that keeps what we did on the # last updates xml_file = join(xml_path, self.__class__.__name__ + '.xml') try: tree = xml_tree.parse(xml_file) except: log.info('XML file not found or not readable. ' 'This script will update every file ' 'from the remote archive. This is ' 'almost the same than run in reset ' 'mode, but the files that exist will ' 'not be downloaded again. Moreover, ' 'the XML file will be rewritten.') return self.rebuild(db_path, log, skip_if_present=True) root = tree.getroot() # Check if the directory for this harvester is present # in the database path = join(db_path,relative_path) ensure_dir(path, log, expected=True) # Open the connection with the remote archive # Enter in the directory tree # Download data for every active float for l in lines_active_floats: # Update the xml with the current status of the float f = A[l]['wmo'] floatname = A[l]['nome_fs'].replace(' ','') if not self.is_a_lov_float(f, floatname): continue wmo_in_xml = 'wmo_' + str(f) f_in_xml = root.findall(wmo_in_xml) if len(f_in_xml) == 0: f_node = xml_tree.SubElement(root, wmo_in_xml) else: f_node = [fn for fn in f_in_xml if fn.tag==wmo_in_xml][0] f_node.set('status', 'A') try: urlfilelist = http_url + floatname + "/liste_all" print(urlfilelist) response = urllib2.urlopen(urlfilelist) except: log.info('No directory associated with file ' + str(f) + '. This file will be skipped!') continue remotepathlist = response.read().rsplit("\n")[:-1] filelist=[os.path.basename(fn) for fn in remotepathlist] # Now I look for the profiles dir. 
This is the folder # where all the data are stored if len(filelist) > 0: download_for_f = [] # Copy all file in a local dir with the same name # skipping the one that we already have float_local_dir = join(path, f) ensure_dir(float_local_dir, log, expected = False) for ff in filelist: url = http_url + floatname + "/" + ff d = download_file(url, ff, float_local_dir, log, None, True) # If the file was downloaded without any problem, # add it to the list of downloaded files if d: downloaded.append(ff) download_for_f.append(ff) if len(download_for_f) == 0: log.info('No updates found for float ' + str(f)) else: log.info('No updates found for float ' + str(f)) print ("DIED FLOATS") for l in lines_dead__floats: f = A[l]['wmo'] floatname = A[l]['nome_fs'].replace(' ','') if not self.is_a_lov_float(f, floatname): continue to_be_updated = False # Update the xml with the current status of the float # Check if it must be updated f_in_xml = root.findall('wmo_' + str(f)) if len(f_in_xml) == 0: # If this float is new, then add it to the archive # and it will be updated to_be_updated = True f_node = xml_tree.SubElement(root, 'wmo_' + str(f)) else: f_node = [fn for fn in f_in_xml if fn.tag=='wmo_'+str(f)][0] # If I already know this float, but the last time it # was not dead, update it if f_node.get('status') != 'D': to_be_updated = True f_node.set('status', 'D') if not to_be_updated: log.debug("Wmo " + str(f) + " is dead and will not be updated") else: log.debug("Wmo " + str(f) + " now is dead but was active on " "the last run and will be updated anyway") if to_be_updated: try: urlfilelist = http_url + floatname + "/liste_all" print(urlfilelist) response = urllib2.urlopen(urlfilelist) except: log.info('No directory associated with file ' + str(f) + '. This file will be skipped!') continue remotepathlist = response.read().rsplit("\n")[:-1] filelist=[os.path.basename(fn) for fn in remotepathlist] # Now I look for the profiles dir. This is the folder # where all the data are stored if len(filelist) > 0: download_for_f = [] # Copy all file in a local dir with the same name # skipping the one that we already have float_local_dir = join(path, f) ensure_dir(float_local_dir, log, expected = False) for ff in filelist: url = http_url + floatname + "/" + ff d = download_file(url, ff, float_local_dir, log, None, True) # If the file was downloaded without any problem, # add it to the list of downloaded files if d: downloaded.append(ff) if len(download_for_f) == 0: log.info('No updates found for float ' + str(f)) else: log.info('No updates found for float ' + str(f)) # Save the XML file root.set('Updated', now_as_string()) xml_as_string = xml_tree.tostring(root) xml_rebuild = parseString(xml_as_string) pretty_xml = xml_rebuild.toprettyxml(indent=' ') pretty_xml_lines = pretty_xml.split('\n') pretty_xml = "\n".join([l for l in pretty_xml_lines if l.strip()]) with open(xml_file, 'w') as xml_f: xml_f.write(pretty_xml) # Return the list of downloaded files return downloaded
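# Hedged, standalone sketch of how the per-float "liste_all" index is turned
# into a list of file names in the two methods above: fetch the plain-text
# listing over HTTP, split it into lines (the trailing empty one is dropped)
# and keep only the basename of each remote path. The URL is built by the
# caller, normally from http_url and the float name.
import os
try:
    from urllib2 import urlopen          # Python 2, as used above
except ImportError:
    from urllib.request import urlopen   # Python 3 equivalent


def _remote_file_names_sketch(index_url):
    response = urlopen(index_url)
    listing = response.read()
    if isinstance(listing, bytes):       # urlopen returns bytes on Python 3
        listing = listing.decode('utf-8', 'replace')
    remote_paths = listing.rsplit("\n")[:-1]
    return [os.path.basename(p) for p in remote_paths]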
def harvest(self, db_path, log): """ Download all the files inside a remote directory of the ftp server whose modification date is after the modification date of the last file in the local dir. Do not download the files if the contain '-NRT-' in their name. Args: - *db_path*: the path of the directory set in the download program. - *log*: a logger object from the class Log to print informations on the standard output Returns: - *downloaded*: a list of all the downloaded filenames. """ # In the following list I will store the name of the # files that will be downloaded or updated downloaded = [] # Check if the directory for this harvester is present # in the database path = join(db_path,relative_path) ensure_dir(path, log, expected=True) # Open the connection with the remote archive connection = FTP(ftp_url) connection.login(user=user, passwd=password) # Enter in the folder "Intermediate" connection.cwd('Intermediate') # Enter in "OCEANCOLOUR_MED_CHL_L4_NRT_OBSERVATIONS_009_060" connection.cwd('OCEANCOLOUR_MED_CHL_L4_NRT_OBSERVATIONS_009_060') # Enter in "dataset-oc-med-chl-modis_a-l4-chl_7km_daily-rt-v02" connection.cwd('dataset-oc-med-chl-modis_a-l4-chl_7km_daily-rt-v01') # List all the local files loc_files = [f for f in listdir(path) if f !='incomplete_download.tmp'] # If there are no files, download everything if len(loc_files)==0: log.info('No local files found! Everything will be ' 'downloaded from the remote repository!') _, years, _ = list_files(connection) for year in years: connection.cwd(year) files, _, perms = list_files(connection) files_to_be_downloaded = [f for f in files if not '-NRT-' in f] for f in files_to_be_downloaded: d = download_file(connection, f, path, log, perms, False) if d: downloaded.append(f) connection.cwd('..') else: loc_files.sort() last_file = loc_files[-1] last_year = int(last_file[0:4]) _, years, _ = list_files(connection) new_years = [y for y in years if int(y)>last_year] # Enter in the folder with the year of the last downloaded # file and download every file which is newer than that connection.cwd(str(last_year)) files, _, perms = list_files(connection) files_to_be_downloaded = [f for f in files if not '-NRT-' in f] for f in files_to_be_downloaded: if f > last_file: d = download_file(connection, f, path, log, perms, True, True) if d: downloaded.append(f) connection.cwd('..') # Now we will download what is in the folders of the years # after the last file for year in new_years: connection.cwd(year) files, _, perms = list_files(connection) files_to_be_downloaded = [f for f in files if not '-NRT-' in f] for f in files_to_be_downloaded: d = download_file(connection, f, path, log, perms, True, True) if d: downloaded.append(f) connection.cwd('..') # Warning if we found a lot of updates or no updates at all if len(downloaded) == 0: log.info('No updates found!') if len(downloaded) >1 : warn_message = 'Downloaded more than one file:' for f in downloaded: warn_message += '\n - ' + str(f) log.info(warn_message, split_lines=False) connection.quit() return downloaded
def harvest(self, db_path, log): """ Download all the files inside the remote directories "vessel" and "mooring" of the remote ftp server whose modification date is after the modification date of the last file in the local dir. Please do not put any file in the local directory because this may change the date of the last edited file Args: - *db_path*: the path of the directory set in the download program. - *log*: a logger object from the class Log to print informations on the standard output Returns: - *downloaded*: a list of all the downloaded filenames. """ # In the following list I will store the name of the # files that will be downloaded or updated downloaded = [] # Check if the directory for this harvester is present # in the database path = join(db_path,relative_path) ensure_dir(path, log, expected=True) # Check if exists the folder "vessel" path_vessel = join(path, "vessel") ensure_dir(path_vessel, log, expected=True) # Check if exists the folder "mooring" path_mooring = join(path, "mooring") ensure_dir(path_mooring, log, expected=True) # Open the connection with the remote archive connection = FTP(ftp_url) connection.login(user=user, passwd=password) # Enter in the folders connection.cwd('Core') connection.cwd('INSITU_MED_NRT_OBSERVATIONS_013_035') connection.cwd('monthly') # Now I will download everything from the vessel dir connection.cwd('vessel') log.debug("Entering in dir vessel") # Check the last file we have already downloaded already_downloaded = listdir(path_vessel) file_dates = [int(l.split('_')[1]) for l in already_downloaded] if len(file_dates) == 0: last_downloaded = 0 else: last_downloaded = max(file_dates) log.debug("Last downloaded file on ??/{0:0>2}/{1:0>4}".format( last_downloaded%100, last_downloaded//100)) # List all the dirs and take only the one that are generated # after the last file downloaded _, subdirs, _ = list_files(connection) subdirs_to_check = [d for d in subdirs if int(d) >= last_downloaded] # Download all the file in that dirs for d in sorted(subdirs_to_check): log.debug("Entering in dir vessel/" + d) connection.cwd(d) files, _, perms = list_files(connection) for f in files: if f[:2] == "MO" and f[-3:]==".nc": d = download_file(connection, f, path_vessel, log, perms, True, False) if d: downloaded.append(f) connection.cwd('..') connection.cwd('..') # Now the same for the mooring dir connection.cwd('mooring') log.debug("Entering in dir mooring") already_downloaded = listdir(path_mooring) file_dates = [int(l.split('_')[1]) for l in already_downloaded if l!='incomplete_download.tmp'] if len(file_dates) == 0: last_downloaded = 0 else: last_downloaded = max(file_dates) log.debug("Last downloaded file on ??/{0:0>2}/{1:0>4}".format( last_downloaded%100, last_downloaded//100)) _, subdirs, _ = list_files(connection) subdirs_to_check = [d for d in subdirs if int(d) >= last_downloaded] for d in sorted(subdirs_to_check): log.debug("Entering in dir mooring/" + d) connection.cwd(d) files, _, perms = list_files(connection) for f in files: if f[:2] == "MO" and f[-3:]==".nc": d = download_file(connection, f, path_mooring, log, perms, True, False) if d: downloaded.append(f) connection.cwd('..') connection.cwd('..') # At the end, download the index connection.cwd('..') _, _, perms = list_files(connection) download_file(connection, 'index_monthly.txt', path, log, perms, False) connection.quit() return downloaded
def rebuild(self, db_path, log): """ Download all the files inside the remote directories "vessel" and "mooring" of the remote ftp server. If a file already exists, it will be rewritten. Args: - *db_path*: the path of the directory set in the download program. - *log*: a logger object from the class Log to print informations on the standard output Returns: - *downloaded*: a list of all the downloaded filenames. """ # In the following list I will store the name of the # files that will be downloaded or updated downloaded = [] # Check if the directory for this harvester is present # in the database path = join(db_path,relative_path) ensure_dir(path, log, expected=False) # Check if exists the folder "vessel" path_vessel = join(path, "vessel") ensure_dir(path_vessel, log, expected=False) # Check if exists the folder "mooring" path_mooring = join(path, "mooring") ensure_dir(path_mooring, log, expected=False) # Open the connection with the remote archive connection = FTP(ftp_url) connection.login(user=user, passwd=password) connection.cwd('Core') connection.cwd('INSITU_MED_NRT_OBSERVATIONS_013_035') connection.cwd('monthly') # Enter in the folder "vessel" connection.cwd('vessel') log.debug("Entering in dir vessel") # For every subdir, download every netcdf file whose # name starts with "MO" and put it in the vessel _, subdirs, _ = list_files(connection) for d in sorted(subdirs): log.debug("Entering in dir vessel/" + d) connection.cwd(d) files, _, perms = list_files(connection) for f in files: if f[:2] == "MO" and f[-3:]==".nc": d = download_file(connection, f, path_vessel, log, perms, False) if d: downloaded.append(f) connection.cwd('..') connection.cwd('..') # The same for the other dir connection.cwd('mooring') log.debug("Entering in dir mooring") _, subdirs, _ = list_files(connection) for d in sorted(subdirs): log.debug("Entering in dir mooring/" + d) connection.cwd(d) files, _, perms = list_files(connection) for f in files: if f[:2] == "MO" and f[-3:]==".nc": d = download_file(connection, f, path_mooring, log, perms, False) if d: downloaded.append(f) connection.cwd('..') connection.cwd('..') # At the end, download the index connection.cwd('..') _, _, perms = list_files(connection) download_file(connection, 'index_monthly.txt', path, log, perms, False) connection.quit() return downloaded