Example #1
    def rebuild(self, db_path, log):
        """
        Download all the files inside a remote directory of the FTP server. If a
        file is already present in the local directory, overwrite it. Do not
        download files whose names contain the string '-NRT'.

        Args:
            - *db_path*: the path of the directory set in the download program.
            - *log*: a logger object from the class Log to print information on
              the standard output
               
        Returns:
            - *downloaded*: a list of all the downloaded filenames.
        """
        
        # In the following list I will store the name of the
        # files that will be downloaded or updated
        downloaded = []

        # Check if the directory for this harvester is present
        # in the database
        path = join(db_path,relative_path)
        ensure_dir(path, log, expected=False)

        # Open the connection with the remote archive
        connection = FTP(ftp_url)
        connection.login(user=user, passwd=password)

        # Enter in the folder "Intermediate"
        connection.cwd('Intermediate')

        # Enter in "OCEANCOLOUR_MED_CHL_L4_NRT_OBSERVATIONS_009_060"
        connection.cwd('OCEANCOLOUR_MED_CHL_L4_NRT_OBSERVATIONS_009_060')

        # Enter in "dataset-oc-med-chl-modis_a-l4-chl_7km_daily-rt-v02"
        connection.cwd('dataset-oc-med-chl-modis_a-l4-chl_7km_daily-rt-v02')
        
        _, years, _ = list_files(connection)
        for year in years:
            connection.cwd(year)
            files, _, perms = list_files(connection)
            for f in files:
                d = download_file(connection, f, path,
                                  log, perms, False)
                if d:
                    downloaded.append(f)
            connection.cwd('..')

        connection.quit()
        return downloaded
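
The directory walk above leans on helpers (list_files, download_file, ensure_dir) and connection settings (ftp_url, user, password) defined elsewhere in the harvester module. As a minimal, self-contained sketch of the same traversal pattern, the snippet below uses only ftplib; the host, credentials and function name (mirror_year_dirs) are placeholders of ours, not part of the original module.

from ftplib import FTP
import os

# Placeholder settings; the real module defines ftp_url, user and password.
FTP_HOST = 'ftp.example.org'
FTP_USER = 'anonymous'
FTP_PASS = 'guest'

def mirror_year_dirs(dataset_path, local_dir):
    """Download every file found in the per-year subdirectories of dataset_path."""
    downloaded = []
    if not os.path.isdir(local_dir):
        os.makedirs(local_dir)
    connection = FTP(FTP_HOST)
    connection.login(user=FTP_USER, passwd=FTP_PASS)
    connection.cwd(dataset_path)
    for year in connection.nlst():          # e.g. '2015', '2016', ...
        connection.cwd(year)
        for fname in connection.nlst():
            with open(os.path.join(local_dir, fname), 'wb') as fh:
                connection.retrbinary('RETR ' + fname, fh.write)
            downloaded.append(fname)
        connection.cwd('..')
    connection.quit()
    return downloaded
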
    def rebuild(self, db_path, log, skip_if_present=False):
        """
        For every float in the wmo file, download every data file related to
        that float whose name starts with 'MR'. Then create an XML file with
        the data read from the wmo file.

        Args:
            - *db_path*: the path of the directory set in the download program.
            - *log*: a logger object from the class Log to print information on
              the standard output
            - *skip_if_present*: a boolean value that, when True, skips downloading
              the files that are already saved in the local directory. Defaults to
              False.
               
        Returns:
            - *downloaded*: a list of all the downloaded filenames.
        """
        # In the following list I will store the name of the
        # files that will be downloaded or updated
        print("REBUILD")
        downloaded = []

        # Read the wmo file line by line (excluding the first line,
        # which does not contain data)
        A = self.wmo_file_reader()


        # Delete, if present, the XML files with all the floats
        xml_file = join(xml_path, self.__class__.__name__ + '.xml')
        if exists(xml_file):
            remove(xml_file)
        # and create a new one (in memory)
        root = xml_tree.Element("BioFloats")
        root.set('Updated', now_as_string())
        tree = xml_tree.ElementTree(root)

        # Check if the directory for this harvester is present
        # in the database
        path = join(db_path,relative_path)
        ensure_dir(path, log, expected=False)

        # Download data for every active float
        for l in range(len(A)):
            f = A[l]['wmo']
            floatname = A[l]['nome_fs'].replace(' ','')
            if not self.is_a_lov_float(f, floatname): continue


            # Update the xml with the current status of the float
            f_in_xml = root.findall('wmo_' + str(f))
            if len(f_in_xml) == 0:
                f_node = xml_tree.SubElement(root, 'wmo_' + str(f))

            else:
                f_node = [fn for fn in f_in_xml if fn.tag=='wmo_'+str(f)][0]
            f_node.set('status', A[l]['status'])

            try:
                urlfilelist = http_url + floatname +  "/liste_all"
                print(urlfilelist)
                response = urllib2.urlopen(urlfilelist)
            except:
                log.info('Cannot download file ' + urlfilelist +
                         '. This file will be skipped!')
                continue

            remotepathlist = response.read().rsplit("\n")[:-1]
            filelist=[os.path.basename(fn) for fn in remotepathlist]
            # Now I look for the profiles dir. This is the folder
            # where all the data are stored
            if len(filelist) > 0:
                download_for_f = []
                # Copy all files into a local dir with the same name,
                # skipping the ones that we already have
                float_local_dir = join(path, f)

                print(float_local_dir)
                ensure_dir(float_local_dir, log, expected = False)
                for ff in filelist:
                    url = http_url + floatname + "/" + ff
                    d = download_file(url, ff, float_local_dir,
                                      log, None, True)
                    # If the file was downloaded without any problem,
                    # add it to the list of downloaded files
                    if d:
                        downloaded.append(ff)
                        download_for_f.append(ff)
                if len(download_for_f) == 0:
                    log.info('No updates found for float ' + str(f))                    
            else:
                log.info('No updates found for float ' + str(f))



        
        # Save the XML file
        xml_as_string = xml_tree.tostring(root)
        xml_rebuild = parseString(xml_as_string)
        pretty_xml = xml_rebuild.toprettyxml(indent='  ')
        pretty_xml_lines = pretty_xml.split('\n')
        pretty_xml = "\n".join([l for l in pretty_xml_lines if l.strip()])

        ensure_dir(xml_path, log, expected=False)
        with open(xml_file, 'w') as xml_f:
            xml_f.write(pretty_xml)

        # Return the list of downloaded files
        return downloaded
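
The XML bookkeeping above (one wmo_<id> element per float, pretty-printed through minidom) can be exercised on its own. Below is a small sketch using the same ElementTree/parseString combination; the float ids and the literal timestamp are made up, and now_as_string() and xml_path belong to the real module.

import xml.etree.ElementTree as xml_tree
from xml.dom.minidom import parseString

# Build a root element with one child per float, as rebuild does above.
root = xml_tree.Element("BioFloats")
root.set('Updated', '2017-01-01 00:00:00')   # now_as_string() in the real module

for wmo, status in [('6901765', 'A'), ('6901766', 'D')]:   # hypothetical ids
    node = xml_tree.SubElement(root, 'wmo_' + wmo)
    node.set('status', status)

# Pretty-print, dropping the blank lines that toprettyxml inserts.
xml_as_string = xml_tree.tostring(root)
pretty_xml = parseString(xml_as_string).toprettyxml(indent='  ')
pretty_xml = "\n".join(l for l in pretty_xml.split('\n') if l.strip())
print(pretty_xml)
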
    def harvest(self, db_path, log):
        """
        For every float in the wmo file, check the status both in the wmo file
        and in the XML one. If the float is reported as active in at least one
        of them, check the last file downloaded for that wmo and download every
        file on the server that is more recent than the one already downloaded.
        Then update the XML file with the status reported in the wmo file.

        Args:
            - *db_path*: the path of the directory set in the download program.
            - *log*: a logger object from the class Log to print information on
              the standard output
               
        Returns:
            - *downloaded*: a list of all the downloaded filenames.
        """
        # In the following list I will store the name of the
        # files that will be downloaded or updated
        print("HARVEST")
        downloaded = []

        # Read the wmo file

        A = self.wmo_file_reader()
        lines_active_floats = np.where(A['status'] == 'A')[0]
        lines_dead_floats = np.where(A['status'] == 'D')[0]


        # Now we need the xml file that keeps what we did on the
        # last updates
        xml_file = join(xml_path, self.__class__.__name__ + '.xml')
        try:
            tree = xml_tree.parse(xml_file)
        except:
            log.info('XML file not found or not readable. '
                     'This script will update every file '
                     'from the remote archive. This is '
                     'almost the same as running in reset '
                     'mode, but the files that already exist will '
                     'not be downloaded again. Moreover, '
                     'the XML file will be rewritten.')
            return self.rebuild(db_path, log, skip_if_present=True)

        root = tree.getroot()

        # Check if the directory for this harvester is present
        # in the database
        path = join(db_path,relative_path)
        ensure_dir(path, log, expected=True)


        # Download data for every active float
        for l in lines_active_floats:
            # Update the xml with the current status of the float
            f = A[l]['wmo']
            floatname = A[l]['nome_fs'].replace(' ','')
            if not self.is_a_lov_float(f, floatname): continue

            wmo_in_xml = 'wmo_' + str(f)
            f_in_xml = root.findall(wmo_in_xml)
            if len(f_in_xml) == 0:
                f_node = xml_tree.SubElement(root, wmo_in_xml)
            else:
                f_node = [fn for fn in f_in_xml if fn.tag==wmo_in_xml][0]
            f_node.set('status', 'A')

            try:
                urlfilelist = http_url + floatname +  "/liste_all"
                print(urlfilelist)
                response = urllib2.urlopen(urlfilelist)
            except:
                log.info('No directory associated with file ' + str(f) +
                         '. This file will be skipped!')
                continue

            remotepathlist = response.read().rsplit("\n")[:-1]
            filelist=[os.path.basename(fn) for fn in remotepathlist]
            # Now I look for the profiles dir. This is the folder
            # where all the data are stored


            if len(filelist) > 0:
                download_for_f = []
                # Copy all files into a local dir with the same name,
                # skipping the ones that we already have
                float_local_dir = join(path, f)
                ensure_dir(float_local_dir, log, expected = False)
                for ff in filelist:
                    url = http_url + floatname + "/" + ff
                    d = download_file(url, ff, float_local_dir,
                                      log, None, True)
                    # If the file was downloaded without any problem,
                    # add it to the list of downloaded files
                    if d:
                        downloaded.append(ff)
                        download_for_f.append(ff)
                if len(download_for_f) == 0:
                    log.info('No updates found for float ' + str(f))                    
            else:
                log.info('No updates found for float ' + str(f))


        print("DEAD FLOATS")
        for l in lines_dead_floats:
            f = A[l]['wmo']
            floatname = A[l]['nome_fs'].replace(' ','')
            if not self.is_a_lov_float(f, floatname): continue

            to_be_updated = False
            # Update the xml with the current status of the float
            # Check if it must be updated
            f_in_xml = root.findall('wmo_' + str(f))
            if len(f_in_xml) == 0:
                # If this float is new, then add it to the archive
                # and it will be updated
                to_be_updated = True
                f_node = xml_tree.SubElement(root, 'wmo_' + str(f))
            else:
                f_node = [fn for fn in f_in_xml if fn.tag=='wmo_'+str(f)][0]
                # If I already know this float, but the last time it
                # was not dead, update it
                if f_node.get('status') != 'D':
                    to_be_updated = True
            f_node.set('status', 'D')
            
            if not to_be_updated:
                log.debug("Wmo " + str(f) + " is dead and will not be updated")
            else:
                log.debug("Wmo " + str(f) + " is now dead but was active on "
                          "the last run, so it will be updated anyway")


            if to_be_updated:
                try:
                    urlfilelist = http_url + floatname +  "/liste_all"
                    print(urlfilelist)
                    response = urllib2.urlopen(urlfilelist)
                except:
                    log.info('No directory associated with file ' + str(f) +
                             '. This file will be skipped!')
                    continue

                remotepathlist = response.read().rsplit("\n")[:-1]
                filelist=[os.path.basename(fn) for fn in remotepathlist]
                # Now I look for the profiles dir. This is the folder
                # where all the data are stored
                if len(filelist) > 0:
                    download_for_f = []
                    # Copy all files into a local dir with the same name,
                    # skipping the ones that we already have
                    float_local_dir = join(path, f)
                    ensure_dir(float_local_dir, log, expected = False)
                    for ff in filelist:
                        url = http_url + floatname + "/" + ff
                        d = download_file(url, ff, float_local_dir,
                                          log, None, True)
                        # If the file was downloaded without any problem,
                        # add it to the list of downloaded files
                        if d:
                            downloaded.append(ff)
                            download_for_f.append(ff)
                    if len(download_for_f) == 0:
                        log.info('No updates found for float ' + str(f))                    
                else:
                    log.info('No updates found for float ' + str(f))

        # Save the XML file
        root.set('Updated', now_as_string())
        xml_as_string = xml_tree.tostring(root)
        xml_rebuild = parseString(xml_as_string)
        pretty_xml = xml_rebuild.toprettyxml(indent='  ')
        pretty_xml_lines = pretty_xml.split('\n')
        pretty_xml = "\n".join([l for l in pretty_xml_lines if l.strip()])
        with open(xml_file, 'w') as xml_f:
            xml_f.write(pretty_xml)

        # Return the list of downloaded files
        return downloaded
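
The find-or-create handling of the wmo_<id> nodes is repeated in rebuild and in both branches of harvest; it could be factored into a small helper. A sketch of such a helper follows (the name get_or_create_float_node is ours, not part of the module):

import xml.etree.ElementTree as xml_tree

def get_or_create_float_node(root, wmo, status):
    """Return the wmo_<id> element under root, creating it if missing,
    and record the float's current status on it."""
    tag = 'wmo_' + str(wmo)
    found = root.findall(tag)
    node = found[0] if found else xml_tree.SubElement(root, tag)
    node.set('status', status)
    return node

# Inside the loops above, each branch would then reduce to a single call, e.g.
# f_node = get_or_create_float_node(root, f, A[l]['status'])
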
Example #4
    def harvest(self, db_path, log):
        """
        Download all the files inside a remote directory of the FTP server
        whose modification date is after the modification date of the last
        file in the local directory. Do not download files that contain
        '-NRT-' in their name.

        Args:
            - *db_path*: the path of the directory set in the download program.
            - *log*: a logger object from the class Log to print information on
              the standard output
               
        Returns:
            - *downloaded*: a list of all the downloaded filenames.
        """
        # In the following list I will store the name of the
        # files that will be downloaded or updated
        downloaded = []

        # Check if the directory for this harvester is present
        # in the database
        path = join(db_path,relative_path)
        ensure_dir(path, log, expected=True)

        # Open the connection with the remote archive
        connection = FTP(ftp_url)
        connection.login(user=user, passwd=password)

        # Enter in the folder "Intermediate"
        connection.cwd('Intermediate')

        # Enter in "OCEANCOLOUR_MED_CHL_L4_NRT_OBSERVATIONS_009_060"
        connection.cwd('OCEANCOLOUR_MED_CHL_L4_NRT_OBSERVATIONS_009_060')

        # Enter in "dataset-oc-med-chl-modis_a-l4-chl_7km_daily-rt-v01"
        connection.cwd('dataset-oc-med-chl-modis_a-l4-chl_7km_daily-rt-v01')
        
        # List all the local files
        loc_files = [f for f in listdir(path) if f !='incomplete_download.tmp']
        
        # If there are no files, download everything
        if len(loc_files)==0:
            log.info('No local files found! Everything will be '
                     'downloaded from the remote repository!')
            _, years, _ = list_files(connection)
            for year in years:
                connection.cwd(year)
                files, _, perms = list_files(connection)
                files_to_be_downloaded = [f for f in files if '-NRT-' not in f]
                for f in files_to_be_downloaded:
                    d = download_file(connection, f, path,
                                      log, perms, False)
                    if d:
                        downloaded.append(f)
                connection.cwd('..')
        else:
            loc_files.sort()
            last_file = loc_files[-1]
            last_year = int(last_file[0:4])
            _, years, _ = list_files(connection)
            new_years = [y for y in years if int(y)>last_year]
            # Enter in the folder with the year of the last downloaded
            # file and download every file which is newer than that
            connection.cwd(str(last_year))
            files, _, perms = list_files(connection)
            files_to_be_downloaded = [f for f in files if '-NRT-' not in f]
            for f in files_to_be_downloaded:
                if f > last_file:
                    d = download_file(connection, f, path,
                                      log, perms, True, True)
                    if d:
                        downloaded.append(f)
            connection.cwd('..')
            # Now we will download what is in the folders of the years
            # after the last file
            for year in new_years:
                connection.cwd(year)
                files, _, perms = list_files(connection)
                files_to_be_downloaded = [f for f in files if '-NRT-' not in f]
                for f in files_to_be_downloaded:
                    d = download_file(connection, f, path,
                                      log, perms, True, True)
                    if d:
                        downloaded.append(f)
                connection.cwd('..')

            # Warn if we found more than one update or no updates at all
            if len(downloaded) == 0:
                log.info('No updates found!')
            if len(downloaded) > 1:
                warn_message = 'Downloaded more than one file:'
                for f in downloaded:
                    warn_message += '\n   - ' + str(f)
                log.info(warn_message, split_lines=False)

        connection.quit()
        return downloaded
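
The selection logic in the else branch (keep only the remote files whose names sort after the newest local file, and skip the '-NRT-' products) is easy to isolate into a pure function for testing. A hedged sketch, assuming the lexicographic filename ordering used above; the function name and the example filenames are ours:

def files_to_download(remote_files, last_local_file):
    """Return the remote filenames that sort after the newest local file,
    excluding near-real-time ('-NRT-') products."""
    return [f for f in sorted(remote_files)
            if '-NRT-' not in f and f > last_local_file]

# Only the file newer than the last local one is kept; the NRT file is skipped.
remote = ['20160101-file.nc', '20160102-file.nc', '20160102-NRT-file.nc']
print(files_to_download(remote, '20160101-file.nc'))   # ['20160102-file.nc']
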
    def harvest(self, db_path, log):
        """
        Download all the files inside the remote directories "vessel" and
        "mooring" of the remote FTP server whose modification date is after
        the modification date of the last file in the local directory. Please
        do not put any file in the local directory, because this may change
        the date of the last edited file.

        Args:
            - *db_path*: the path of the directory set in the download program.
            - *log*: a logger object from the class Log to print information on
              the standard output
               
        Returns:
            - *downloaded*: a list of all the downloaded filenames.
        """

        # In the following list I will store the name of the
        # files that will be downloaded or updated
        downloaded = []

        # Check if the directory for this harvester is present
        # in the database
        path = join(db_path,relative_path)
        ensure_dir(path, log, expected=True)
        # Check if the folder "vessel" exists
        path_vessel = join(path, "vessel")
        ensure_dir(path_vessel, log, expected=True)
        # Check if the folder "mooring" exists
        path_mooring = join(path, "mooring")
        ensure_dir(path_mooring, log, expected=True)

        # Open the connection with the remote archive
        connection = FTP(ftp_url)
        connection.login(user=user, passwd=password)

        # Enter in the folders
        connection.cwd('Core')
        connection.cwd('INSITU_MED_NRT_OBSERVATIONS_013_035')
        connection.cwd('monthly')
        
        # Now I will download everything from the vessel dir
        connection.cwd('vessel')
        log.debug("Entering in dir vessel")

        # Check the last file we have already downloaded
        already_downloaded = listdir(path_vessel)
        file_dates = [int(l.split('_')[1]) for l in already_downloaded
                      if l != 'incomplete_download.tmp']
        if len(file_dates) == 0:
            last_downloaded = 0
        else:
            last_downloaded = max(file_dates)
        log.debug("Last downloaded file on ??/{0:0>2}/{1:0>4}".format(
                   last_downloaded%100, last_downloaded//100))
                       
        # List all the dirs and take only the ones that were generated
        # after the last downloaded file
        _, subdirs, _ = list_files(connection)
        subdirs_to_check = [d for d in subdirs if int(d) >= last_downloaded]

        # Download all the files in those dirs
        for d in sorted(subdirs_to_check):
            log.debug("Entering in dir vessel/" + d)
            connection.cwd(d)
            files, _, perms = list_files(connection)
            for f in files:
                if f[:2] == "MO" and f[-3:] == ".nc":
                    ok = download_file(connection, f, path_vessel,
                                       log, perms, True, False)
                    if ok:
                        downloaded.append(f)
            connection.cwd('..')
        connection.cwd('..')

        # Now the same for the mooring dir
        connection.cwd('mooring')
        log.debug("Entering in dir mooring")

        already_downloaded = listdir(path_mooring)
        file_dates = [int(l.split('_')[1]) for l in already_downloaded
                      if l != 'incomplete_download.tmp']
        if len(file_dates) == 0:
            last_downloaded = 0
        else:
            last_downloaded = max(file_dates)
        log.debug("Last downloaded file on ??/{0:0>2}/{1:0>4}".format(
                   last_downloaded%100, last_downloaded//100))
        
        _, subdirs, _ = list_files(connection)
        subdirs_to_check = [d for d in subdirs if int(d) >= last_downloaded]

        for d in sorted(subdirs_to_check):
            log.debug("Entering in dir mooring/" + d)
            connection.cwd(d)
            files, _, perms = list_files(connection)
            for f in files:
                if f[:2] == "MO" and f[-3:] == ".nc":
                    ok = download_file(connection, f, path_mooring,
                                       log, perms, True, False)
                    if ok:
                        downloaded.append(f)
            connection.cwd('..')
        connection.cwd('..')
        
        # At the end, download the index
        connection.cwd('..')
        _, _, perms = list_files(connection)
        download_file(connection, 'index_monthly.txt', path,
                      log, perms, False)

        connection.quit()
        return downloaded
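
Both the vessel and mooring branches derive the last downloaded month from the second underscore-separated field of the local filenames and then keep only the monthly subdirectories at or after it. A standalone sketch of that selection (the function name and the example filenames are hypothetical):

def monthly_dirs_to_check(local_files, remote_subdirs):
    """Given local files named like 'MO_201603_xxx.nc' and remote subdirs
    named 'YYYYMM', return the subdirs at or after the last local month."""
    dates = [int(f.split('_')[1]) for f in local_files
             if f != 'incomplete_download.tmp']
    last = max(dates) if dates else 0
    return sorted(d for d in remote_subdirs if int(d) >= last)

print(monthly_dirs_to_check(['MO_201602_a.nc', 'MO_201603_b.nc'],
                            ['201601', '201602', '201603', '201604']))
# ['201603', '201604']
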
    def rebuild(self, db_path, log):
        """
        Download all the files inside the remote directories "vessel" and
        "mooring" of the remote FTP server. If a file already exists, it
        will be overwritten.

        Args:
            - *db_path*: the path of the directory set in the download program.
            - *log*: a logger object from the class Log to print information on
              the standard output
               
        Returns:
            - *downloaded*: a list of all the downloaded filenames.
        """

        # In the following list I will store the name of the
        # files that will be downloaded or updated
        downloaded = []

        # Check if the directory for this harvester is present
        # in the database
        path = join(db_path,relative_path)
        ensure_dir(path, log, expected=False)
        # Check if the folder "vessel" exists
        path_vessel = join(path, "vessel")
        ensure_dir(path_vessel, log, expected=False)
        # Check if the folder "mooring" exists
        path_mooring = join(path, "mooring")
        ensure_dir(path_mooring, log, expected=False)


        # Open the connection with the remote archive
        connection = FTP(ftp_url)
        connection.login(user=user, passwd=password)

        connection.cwd('Core')
        connection.cwd('INSITU_MED_NRT_OBSERVATIONS_013_035')
        connection.cwd('monthly')
        
        # Enter in the folder "vessel"
        connection.cwd('vessel')
        log.debug("Entering in dir vessel")

        # For every subdir, download every NetCDF file whose
        # name starts with "MO" and put it in the vessel directory
        _, subdirs, _ = list_files(connection)
        for d in sorted(subdirs):
            log.debug("Entering in dir vessel/" + d)
            connection.cwd(d)
            files, _, perms = list_files(connection)
            for f in files:
                if f[:2] == "MO" and f[-3:] == ".nc":
                    ok = download_file(connection, f, path_vessel,
                                       log, perms, False)
                    if ok:
                        downloaded.append(f)
            connection.cwd('..')
        connection.cwd('..')
        
        # The same for the other dir
        connection.cwd('mooring')
        log.debug("Entering in dir mooring")

        _, subdirs, _ = list_files(connection)
        for d in sorted(subdirs):
            log.debug("Entering in dir mooring/" + d)
            connection.cwd(d)
            files, _, perms = list_files(connection)
            for f in files:
                if f[:2] == "MO" and f[-3:] == ".nc":
                    ok = download_file(connection, f, path_mooring,
                                       log, perms, False)
                    if ok:
                        downloaded.append(f)
            connection.cwd('..')
        connection.cwd('..')

        # At the end, download the index
        connection.cwd('..')
        _, _, perms = list_files(connection)
        download_file(connection, 'index_monthly.txt', path,
                      log, perms, False)

        connection.quit()
        return downloaded
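
The same filename filter (NetCDF files whose names start with "MO") appears in all four download loops of this example; it can be expressed as a small predicate. A minimal sketch, with hypothetical filenames in the usage lines:

def is_mooring_netcdf(filename):
    """True for NetCDF files whose names start with 'MO', as required by the loops above."""
    return filename.startswith("MO") and filename.endswith(".nc")

print(is_mooring_netcdf("MO_201604_TS_MO_61198.nc"))   # True
print(is_mooring_netcdf("GL_201604_TS_DB_61198.nc"))   # False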