Example #1
0
    def rebuild(self, db_path, log):
        """
        Download all the files inside a remote directory of the ftp server.
        If a file is already present in the local directory, it is rewritten.
        Files whose name contains the string '-NRT-' (near-real-time
        products) are skipped, consistently with the harvest method.

        Args:
            - *db_path*: the path of the directory set in the download program.
            - *log*: a logger object from the class Log to print information on
              the standard output

        Returns:
            - *downloaded*: a list of all the downloaded filenames.
        """

        # In the following list I will store the name of the
        # files that will be downloaded or updated
        downloaded = []

        # Create the directory for this harvester in the database
        # (expected=False: a rebuild starts from an empty database)
        path = join(db_path, relative_path)
        ensure_dir(path, log, expected=False)

        # Open the connection with the remote archive
        connection = FTP(ftp_url)
        connection.login(user=user, passwd=password)

        # Walk down to the dataset directory
        connection.cwd('Intermediate')
        connection.cwd('OCEANCOLOUR_MED_CHL_L4_NRT_OBSERVATIONS_009_060')
        # NOTE(review): rebuild enters ...-rt-v02 while harvest enters
        # ...-rt-v01 — confirm which dataset version is intended
        connection.cwd('dataset-oc-med-chl-modis_a-l4-chl_7km_daily-rt-v02')

        # The remote tree has one sub-directory per year
        _, years, _ = list_files(connection)
        for year in years:
            connection.cwd(year)
            files, _, perms = list_files(connection)
            # Bug fix: skip the near-real-time products, as promised by the
            # docstring and as already done by the harvest method
            files_to_be_downloaded = [f for f in files if '-NRT-' not in f]
            for f in files_to_be_downloaded:
                d = download_file(connection, f, path,
                                  log, perms, False)
                if d:
                    downloaded.append(f)
            connection.cwd('..')

        connection.quit()
        return downloaded
Example #2
0
    def harvest(self, db_path, log):
        """
        Download all the files inside a remote directory of the ftp server
        whose modification date is after the modification date of the last
        file in the local dir. Do not download the files if they contain
        '-NRT-' in their name.

        Args:
            - *db_path*: the path of the directory set in the download program.
            - *log*: a logger object from the class Log to print information on
              the standard output

        Returns:
            - *downloaded*: a list of all the downloaded filenames.
        """
        # In the following list I will store the name of the
        # files that will be downloaded or updated
        downloaded = []

        # Check that the directory for this harvester is already present
        # in the database (expected=True: harvest assumes a previous rebuild)
        path = join(db_path,relative_path)
        ensure_dir(path, log, expected=True)

        # Open the connection with the remote archive
        connection = FTP(ftp_url)
        connection.login(user=user, passwd=password)

        # Enter in the folder "Intermediate"
        connection.cwd('Intermediate')

        # Enter in "OCEANCOLOUR_MED_CHL_L4_NRT_OBSERVATIONS_009_060"
        connection.cwd('OCEANCOLOUR_MED_CHL_L4_NRT_OBSERVATIONS_009_060')

        # Enter in "dataset-oc-med-chl-modis_a-l4-chl_7km_daily-rt-v01"
        # NOTE(review): the rebuild method above enters the ...-rt-v02
        # dataset instead — confirm which version is the intended one
        connection.cwd('dataset-oc-med-chl-modis_a-l4-chl_7km_daily-rt-v01')
        
        # List all the local files, ignoring the marker file left behind
        # by an interrupted download
        loc_files = [f for f in listdir(path) if f !='incomplete_download.tmp']
        
        # If there are no files, download everything
        if len(loc_files)==0:
            log.info('No local files found! Everything will be '
                     'downloaded from the remote repository!')
            # The remote tree has one sub-directory per year
            _, years, _ = list_files(connection)
            for year in years:
                connection.cwd(year)
                files, _, perms = list_files(connection)
                # Skip near-real-time products
                files_to_be_downloaded = [f for f in files if not '-NRT-' in f]
                for f in files_to_be_downloaded:
                    d = download_file(connection, f, path,
                                      log, perms, False)
                    if d:
                        downloaded.append(f)
                connection.cwd('..')
        else:
            # File names start with the date, so after sorting the last
            # entry is the most recent file and its first four characters
            # are the year
            loc_files.sort()
            last_file = loc_files[-1]
            last_year = int(last_file[0:4])
            _, years, _ = list_files(connection)
            new_years = [y for y in years if int(y)>last_year]
            # Enter in the folder with the year of the last downloaded
            # file and download every file which is newer than that
            connection.cwd(str(last_year))
            files, _, perms = list_files(connection)
            files_to_be_downloaded = [f for f in files if not '-NRT-' in f]
            for f in files_to_be_downloaded:
                # Lexicographic comparison works because of the date prefix
                if f > last_file:
                    d = download_file(connection, f, path,
                                      log, perms, True, True)
                    if d:
                        downloaded.append(f)
            connection.cwd('..')
            # Now we will download what is in the folders of the years
            # after the last file
            for year in new_years:
                connection.cwd(year)
                files, _, perms = list_files(connection)
                files_to_be_downloaded = [f for f in files if not '-NRT-' in f]
                for f in files_to_be_downloaded:
                    d = download_file(connection, f, path,
                                      log, perms, True, True)
                    if d:
                        downloaded.append(f)
                connection.cwd('..')

            # Warning if we found a lot of updates or no updates at all
            if len(downloaded) == 0:
                log.info('No updates found!')
            if len(downloaded) >1 : 
                warn_message = 'Downloaded more than one file:'
                for f in downloaded:
                    warn_message += '\n   - ' + str(f)
                log.info(warn_message, split_lines=False)

        connection.quit()
        return downloaded
    def harvest(self, db_path, log):
        """
        Download all the files inside the remote directories "vessel" and
        "mooring" of the remote ftp server whose modification date is after
        the modification date of the last file in the local dir. Please do
        not put any file in the local directory because this may change the
        date of the last edited file

        Args:
            - *db_path*: the path of the directory set in the download program.
            - *log*: a logger object from the class Log to print information on
              the standard output

        Returns:
            - *downloaded*: a list of all the downloaded filenames.
        """

        # In the following list I will store the name of the
        # files that will be downloaded or updated
        downloaded = []

        # Check that the local database layout exists
        # (expected=True: harvest assumes a previous rebuild)
        path = join(db_path, relative_path)
        ensure_dir(path, log, expected=True)
        # Check if exists the folder "vessel"
        path_vessel = join(path, "vessel")
        ensure_dir(path_vessel, log, expected=True)
        # Check if exists the folder "mooring"
        path_mooring = join(path, "mooring")
        ensure_dir(path_mooring, log, expected=True)

        # Open the connection with the remote archive
        connection = FTP(ftp_url)
        connection.login(user=user, passwd=password)

        # Enter in the folders of the monthly in-situ dataset
        connection.cwd('Core')
        connection.cwd('INSITU_MED_NRT_OBSERVATIONS_013_035')
        connection.cwd('monthly')

        def _pull_new_files(remote_dir, local_path):
            """Download the new "MO*.nc" files of one platform directory.

            The vessel and mooring directories share the same layout
            (one YYYYMM sub-directory per month), so the same walk is
            applied to both.
            """
            connection.cwd(remote_dir)
            log.debug("Entering in dir " + remote_dir)

            # Check the last file we have already downloaded: file names
            # carry their YYYYMM date as the second '_'-separated field.
            # Bug fix: skip the 'incomplete_download.tmp' marker, otherwise
            # int() raises ValueError after an interrupted download (the
            # mooring branch already did this, the vessel one did not).
            already_downloaded = listdir(local_path)
            file_dates = [int(l.split('_')[1]) for l in already_downloaded
                          if l != 'incomplete_download.tmp']
            if len(file_dates) == 0:
                last_downloaded = 0
            else:
                last_downloaded = max(file_dates)
            log.debug("Last downloaded file on ??/{0:0>2}/{1:0>4}".format(
                       last_downloaded%100, last_downloaded//100))

            # List all the dirs and keep only the ones generated after the
            # last file downloaded
            _, subdirs, _ = list_files(connection)
            subdirs_to_check = [d for d in subdirs if int(d) >= last_downloaded]

            # Download all the files in those dirs (the loop variable no
            # longer shadows the download result as in the previous version)
            for subdir in sorted(subdirs_to_check):
                log.debug("Entering in dir " + remote_dir + "/" + subdir)
                connection.cwd(subdir)
                files, _, perms = list_files(connection)
                for f in files:
                    # Only the netCDF products whose name starts with "MO"
                    if f[:2] == "MO" and f[-3:] == ".nc":
                        saved = download_file(connection, f, local_path,
                                              log, perms, True, False)
                        if saved:
                            downloaded.append(f)
                connection.cwd('..')
            connection.cwd('..')

        _pull_new_files('vessel', path_vessel)
        _pull_new_files('mooring', path_mooring)

        # At the end, download the index from the dataset root
        connection.cwd('..')
        _, _, perms = list_files(connection)
        download_file(connection, 'index_monthly.txt', path,
                      log, perms, False)

        connection.quit()
        return downloaded
    def rebuild(self, db_path, log):
        """
        Download all the files inside the remote directories "vessel" and
        "mooring" of the remote ftp server. If a file already exists, it
        will be rewritten.

        Args:
            - *db_path*: the path of the directory set in the download program.
            - *log*: a logger object from the class Log to print information on
              the standard output

        Returns:
            - *downloaded*: a list of all the downloaded filenames.
        """

        # In the following list I will store the name of the
        # files that will be downloaded or updated
        downloaded = []

        # Create the local database layout from scratch
        # (expected=False: a rebuild starts from an empty database)
        path = join(db_path, relative_path)
        ensure_dir(path, log, expected=False)
        # Check if exists the folder "vessel"
        path_vessel = join(path, "vessel")
        ensure_dir(path_vessel, log, expected=False)
        # Check if exists the folder "mooring"
        path_mooring = join(path, "mooring")
        ensure_dir(path_mooring, log, expected=False)

        # Open the connection with the remote archive
        connection = FTP(ftp_url)
        connection.login(user=user, passwd=password)

        # Enter in the folders of the monthly in-situ dataset
        connection.cwd('Core')
        connection.cwd('INSITU_MED_NRT_OBSERVATIONS_013_035')
        connection.cwd('monthly')

        def _mirror(remote_dir, local_path):
            """Download every "MO*.nc" file of one platform directory.

            The vessel and mooring directories share the same layout
            (one sub-directory per month), so the same walk is applied
            to both.
            """
            connection.cwd(remote_dir)
            log.debug("Entering in dir " + remote_dir)

            # For every subdir, download every netcdf file whose name
            # starts with "MO" (the loop variable no longer shadows the
            # download result as in the previous version)
            _, subdirs, _ = list_files(connection)
            for subdir in sorted(subdirs):
                log.debug("Entering in dir " + remote_dir + "/" + subdir)
                connection.cwd(subdir)
                files, _, perms = list_files(connection)
                for f in files:
                    if f[:2] == "MO" and f[-3:] == ".nc":
                        saved = download_file(connection, f, local_path,
                                              log, perms, False)
                        if saved:
                            downloaded.append(f)
                connection.cwd('..')
            connection.cwd('..')

        _mirror('vessel', path_vessel)
        _mirror('mooring', path_mooring)

        # At the end, download the index from the dataset root
        connection.cwd('..')
        _, _, perms = list_files(connection)
        download_file(connection, 'index_monthly.txt', path,
                      log, perms, False)

        connection.quit()
        return downloaded