Example 1
    def rebuild(self, db_path, log, skip_if_present=False):
        """
        For every float in the wmo file, download every data file related to
        that float whose name starts with 'MR'. Then create an XML file with
        the data read from the wmo file.

        Args:
            - *db_path*: the path of the directory set in the download program.
            - *log*: a logger object from the Log class used to print
              information on the standard output.
            - *skip_if_present*: a boolean flag; if True, the files already
              saved in the local directory are not downloaded again.
              Defaults to False.
               
        Returns:
            - *downloaded*: a list of all the downloaded filenames.
        """
        # The following list will store the names of the files
        # that are downloaded or updated
        print("REBUILD")
        downloaded = []

        # Read the wmo file line by line (exclude the first one because
        # it does not contain data)
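        # (wmo_file_reader() is expected to return a record array exposing the
        #  'wmo', 'nome_fs' and 'status' fields that are used below)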
        A = self.wmo_file_reader()


        # Delete, if present, the XML files with all the floats
        xml_file = join(xml_path, self.__class__.__name__ + '.xml')
        if exists(xml_file):
            remove(xml_file)
        # and create a new one (in memory)
        root = xml_tree.Element("BioFloats")
        root.set('Updated', now_as_string())
        tree = xml_tree.ElementTree(root)

        # Check if the directory for this harvester is present
        # in the database
        path = join(db_path, relative_path)
        ensure_dir(path, log, expected=False)

        # The connection with the remote archive is opened per float
        # inside the loop below

        # Download data for every float listed in the wmo file
        for l in range(len(A)):
            f = A[l]['wmo']
            floatname = A[l]['nome_fs'].replace(' ','')
            if not self.is_a_lov_float(f, floatname): continue


            # Update the xml with the current status of the float
            f_in_xml = root.findall('wmo_' + str(f))
            if len(f_in_xml) == 0:
                f_node = xml_tree.SubElement(root, 'wmo_' + str(f))
            else:
                f_node = f_in_xml[0]
            f_node.set('status', A[l]['status'])

            urlfilelist = http_url + floatname + "/liste_all"
            print(urlfilelist)
            try:
                response = urllib2.urlopen(urlfilelist)
            except Exception:
                log.info('Cannot download file ' + urlfilelist +
                         '. This file will be skipped!')
                continue

            remotepathlist = response.read().rsplit("\n")[:-1]
            filelist = [os.path.basename(fn) for fn in remotepathlist]
            # If the remote listing is not empty, download the files
            # of this float that we do not have yet
            if len(filelist) > 0:
                download_for_f = []
                # Copy all files into a local dir with the same name,
                # skipping the ones that we already have
                float_local_dir = join(path, f)

                print(float_local_dir)
                ensure_dir(float_local_dir, log, expected=False)
                for ff in filelist:
                    url = http_url + floatname + "/" + ff
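                    # download_file is assumed to return a truthy value only
                    # when the file was actually fetched; its last argument is
                    # presumably the skip-if-present flag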
                    d = download_file(url, ff, float_local_dir,
                                      log, None, skip_if_present)
                    # If the file was downloaded without any problem,
                    # add it to the list of downloaded files
                    if d:
                        downloaded.append(ff)
                        download_for_f.append(ff)
                if len(download_for_f) == 0:
                    log.info('No updates found for float ' + str(f))                    
            else:
                log.info('No updates found for float ' + str(f))



        
        # Save the XML file
        xml_as_string = xml_tree.tostring(root)
        xml_rebuild = parseString(xml_as_string)
        pretty_xml = xml_rebuild.toprettyxml(indent='  ')
        pretty_xml_lines = pretty_xml.split('\n')
        pretty_xml = "\n".join([l for l in pretty_xml_lines if l.strip()])

        ensure_dir(xml_path, log, expected=False)
        with open(xml_file, 'w') as xml_f:
            xml_f.write(pretty_xml)

        # Return the list of downloaded files
        return downloaded
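Both examples rely on a few module-level imports and names (xml_path, relative_path, http_url, now_as_string, ensure_dir, download_file) that are not shown in the snippets. A minimal sketch of that context follows; the values and helpers below are assumptions for illustration, not the module's real definitions, and download_file and the Log class are left out because their behavior cannot be inferred from the code above.

import os
import urllib2
import numpy as np
import xml.etree.ElementTree as xml_tree
from os import remove
from os.path import join, exists
from xml.dom.minidom import parseString
from datetime import datetime

# Placeholder values: the real module defines these elsewhere.
http_url = 'http://example.org/lov/'   # hypothetical base URL of the remote archive
relative_path = 'FLOAT_LOV'            # hypothetical subdirectory inside db_path
xml_path = '/path/to/xml'              # hypothetical directory holding the status XML


def now_as_string():
    # Assumed helper: timestamp written into the 'Updated' attribute of the XML root.
    return datetime.now().strftime('%Y-%m-%d %H:%M:%S')


def ensure_dir(path, log, expected=True):
    # Assumed helper: create the directory if it is missing,
    # logging when it was expected to be there already.
    if not exists(path):
        if expected:
            log.info('Directory ' + path + ' was expected but is missing')
        os.makedirs(path)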
Example 2
    def harvest(self, db_path, log):
        """
        For every float in the wmo file, check the status in the wmo file and
        in the XML one. If at least one of the two reports the float as active,
        check the last file downloaded for that wmo and download every file
        on the server that is more recent than the ones already downloaded.
        Then update the XML file with the status reported in the wmo file.

        Args:
            - *db_path*: the path of the directory set in the download program.
            - *log*: a logger object from the Log class used to print
              information on the standard output.
               
        Returns:
            - *downloaded*: a list of all the downloaded filenames.
        """
        # The following list will store the names of the files
        # that are downloaded or updated
        print("HARVEST")
        downloaded = []

        # Read the wmo file

        A = self.wmo_file_reader()
        lines_active_floats = np.where(A['status'] == 'A')[0]
        lines_dead_floats = np.where(A['status'] == 'D')[0]


        # Now we need the XML file that records what was done
        # in the previous updates
        xml_file = join(xml_path, self.__class__.__name__ + '.xml')
        try:
            tree = xml_tree.parse(xml_file)
        except Exception:
            log.info('XML file not found or not readable. '
                     'This script will update every file '
                     'from the remote archive. This is '
                     'almost the same as running in reset '
                     'mode, but the files that already exist '
                     'will not be downloaded again. Moreover, '
                     'the XML file will be rewritten.')
            return self.rebuild(db_path, log, skip_if_present=True)

        root = tree.getroot()

        # Check if the directory for this harvester is present
        # in the database
        path = join(db_path, relative_path)
        ensure_dir(path, log, expected=True)

        # The connection with the remote archive is opened per float
        # inside the loops below

        # Download data for every active float
        for l in lines_active_floats:
            # Update the xml with the current status of the float
            f = A[l]['wmo']
            floatname = A[l]['nome_fs'].replace(' ','')
            if not self.is_a_lov_float(f, floatname): continue

            wmo_in_xml = 'wmo_' + str(f)
            f_in_xml = root.findall(wmo_in_xml)
            if len(f_in_xml) == 0:
                f_node = xml_tree.SubElement(root, wmo_in_xml)
            else:
                f_node = f_in_xml[0]
            f_node.set('status', 'A')

            urlfilelist = http_url + floatname + "/liste_all"
            print(urlfilelist)
            try:
                response = urllib2.urlopen(urlfilelist)
            except Exception:
                log.info('No directory associated with float ' + str(f) +
                         '. This float will be skipped!')
                continue

            remotepathlist = response.read().rsplit("\n")[:-1]
            filelist = [os.path.basename(fn) for fn in remotepathlist]
            # If the remote listing is not empty, download the files
            # of this float that we do not have yet


            if len(filelist) > 0:
                download_for_f = []
                # Copy all files into a local dir with the same name,
                # skipping the ones that we already have
                float_local_dir = join(path, f)
                ensure_dir(float_local_dir, log, expected=False)
                for ff in filelist:
                    url = http_url + floatname + "/" + ff
                    d = download_file(url, ff, float_local_dir,
                                      log, None, True)
                    # If the file was downloaded without any problem,
                    # add it to the list of downloaded files
                    if d:
                        downloaded.append(ff)
                        download_for_f.append(ff)
                if len(download_for_f) == 0:
                    log.info('No updates found for float ' + str(f))                    
            else:
                log.info('No updates found for float ' + str(f))


        print ("DIED FLOATS")
        for l in lines_dead_floats:
            f = A[l]['wmo']
            floatname = A[l]['nome_fs'].replace(' ','')
            if not self.is_a_lov_float(f, floatname): continue

            to_be_updated = False
            # Update the xml with the current status of the float
            # Check if it must be updated
            f_in_xml = root.findall('wmo_' + str(f))
            if len(f_in_xml) == 0:
                # If this float is new, then add it to the archive
                # and it will be updated
                to_be_updated = True
                f_node = xml_tree.SubElement(root, 'wmo_' + str(f))
            else:
                f_node = f_in_xml[0]
                # If I already know this float, but the last time it
                # was not dead, update it
                if f_node.get('status') != 'D':
                    to_be_updated = True
            f_node.set('status', 'D')
            
            if not to_be_updated:
                log.debug("Wmo " + str(f) + " is dead and will not be updated")
            else:
                log.debug("Wmo " + str(f) + " now is dead but was active on "
                          "the last run and will be updated anyway")


            if to_be_updated:
                urlfilelist = http_url + floatname + "/liste_all"
                print(urlfilelist)
                try:
                    response = urllib2.urlopen(urlfilelist)
                except Exception:
                    log.info('No directory associated with float ' + str(f) +
                             '. This float will be skipped!')
                    continue

                remotepathlist = response.read().rsplit("\n")[:-1]
                filelist = [os.path.basename(fn) for fn in remotepathlist]
                # If the remote listing is not empty, download the files
                # of this float that we do not have yet
                if len(filelist) > 0:
                    download_for_f = []
                    # Copy all files into a local dir with the same name,
                    # skipping the ones that we already have
                    float_local_dir = join(path, f)
                    ensure_dir(float_local_dir, log, expected=False)
                    for ff in filelist:
                        url = http_url + floatname + "/" + ff
                        d = download_file(url, ff, float_local_dir,
                                          log, None, True)
                        # If the file was downloaded without any problem,
                        # add it to the list of downloaded files
                        if d:
                            downloaded.append(ff)
                            download_for_f.append(ff)
                    if len(download_for_f) == 0:
                        log.info('No updates found for float ' + str(f))                    
                else:
                    log.info('No updates found for float ' + str(f))

        # Save the XML file
        root.set('Updated', now_as_string())
        xml_as_string = xml_tree.tostring(root)
        xml_rebuild = parseString(xml_as_string)
        pretty_xml = xml_rebuild.toprettyxml(indent='  ')
        pretty_xml_lines = pretty_xml.split('\n')
        pretty_xml = "\n".join([l for l in pretty_xml_lines if l.strip()])
        with open(xml_file, 'w') as xml_f:
            xml_f.write(pretty_xml)

        # Return the list of downloaded files
        return downloaded
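The block that serializes the element tree, pretty-prints it and writes it to disk is duplicated at the end of both methods. As a sketch (not part of the original module), it could be factored into a small helper shared by rebuild and harvest:

import xml.etree.ElementTree as xml_tree
from xml.dom.minidom import parseString


def save_pretty_xml(root, xml_file):
    # Serialize the tree, re-parse it with minidom to get indentation,
    # drop the blank lines that toprettyxml inserts and write the result.
    xml_as_string = xml_tree.tostring(root)
    pretty_xml = parseString(xml_as_string).toprettyxml(indent='  ')
    pretty_xml = "\n".join(l for l in pretty_xml.split('\n') if l.strip())
    with open(xml_file, 'w') as xml_f:
        xml_f.write(pretty_xml)

Both methods could then end with save_pretty_xml(root, xml_file) followed by return downloaded.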