def get_dir_contents(url):
    """
    Return the contents of the remote directory found at the URL passed in.

    The response body is read in full and decoded as UTF-8. The handle
    returned by urlopen is always closed, whether or not reading succeeds.

    :param url: The URL of the directory for which the contents are returned.
    :return: dir_contents: The decoded contents of the remote URL directory.
    :raises urllib2.HTTPError: If there is an issue reading the URL passed in;
        the error is logged as a warning and then re-raised.
    """
    try:
        url_io = urllib2.urlopen(url)
        try:
            dir_contents = url_io.read().decode('utf-8')
        finally:
            # urlopen's handle is not closed automatically; release it even
            # when read() or decode() fails.
            url_io.close()
    except urllib2.HTTPError as ex:
        log.warning('URL does not exist: %s: %s', url, ex)
        raise

    return dir_contents
def get_dir_contents(path, reverse_order=False):
    """
    List the entries of the directory at the path passed in, in sorted order.

    :param path: The path of the directory for which the contents are returned.
    :param reverse_order: If true the contents are sorted in reverse order.
    :return: dir_contents: The sorted list of names found in the directory.
    :raises OSError: If the directory cannot be listed; the error is logged
        as a warning and then re-raised.
    """
    try:
        entries = os.listdir(path)
    except OSError as ex:
        log.warning('Path does not exist: %s', ex)
        raise

    return sorted(entries, reverse=reverse_order)
def generate_zplsc_echograms(self):
    """
    Generate ZPLSC echograms, either for one 1-hour raw data file or for
    24-hour periods of every requested subsite, deployment and date.

    If self.zplsc_datafile is set, that single file is handed to the ZPLSC
    C Series parser. The echogram directory is built under
    self.base_echogram_directory from the part of the raw data path that
    starts at the first matching entry of self.zplsc_subsites.

    Otherwise, for each subsite, deployment and echogram date, the raw data
    is aggregated into one 24-hour data file (see aggregate_raw_data) and
    that file is handed to the parser. Dates flagged as "entire month" are
    expanded to every day of that month. Subsites or deployments whose
    directories cannot be read (OSError) are skipped.

    NOTE(review): the temporary directory removal and the daily re-scheduling
    are performed in the 24-hour case only, since self.temp_directory is only
    assigned there - confirm this matches the intended layout.

    :return:
    """
    # If we are creating a 1-hour echogram, generate the echogram.
    if self.zplsc_datafile is not None:
        # Send the 1-hour raw data file to the zplsc C Series parser to generate the echogram.
        with open(self.zplsc_datafile) as file_handle:
            base_directory = os.path.expanduser(self.base_echogram_directory)
            path_structure = os.path.dirname(self.zplsc_datafile)

            zplsc_echogram_file_path = None
            for subsite in self.zplsc_subsites:
                subsite_index = path_structure.find(subsite)
                if subsite_index < 0:
                    continue

                zplsc_echogram_file_path = os.path.join(base_directory, path_structure[subsite_index:])

                # Create the ZPLSC Echogram directory structure if it doesn't exist.
                try:
                    os.makedirs(zplsc_echogram_file_path)
                except OSError as ex:
                    # An already existing directory is fine; anything else is fatal.
                    if ex.errno != errno.EEXIST or not os.path.isdir(zplsc_echogram_file_path):
                        # Log the exception itself: ex.message is deprecated and is
                        # empty for errno-style OSErrors, which hid the failure reason.
                        log.error('Error creating local ZPLSC Echogram storage directory: %s', ex)
                        raise
                break

            if zplsc_echogram_file_path is not None:
                # Get the parser for this file and generate the echogram.
                parser = ZplscCParser(CONFIG, file_handle, self.rec_exception_callback)
                parser.create_echogram(zplsc_echogram_file_path)
            else:
                log.warning('The subsite is not one of the subsites containing a ZPLSC-C instrument.')

    else:  # We are creating 24-hour echograms ...
        # Create the temporary data file directory.
        self.temp_directory = os.path.join(os.path.expanduser(USER_HOME), TEMP_DIR)
        if not os.path.exists(self.temp_directory):
            os.mkdir(self.temp_directory)

        # Create the echograms for the zplsc instruments of each subsite.
        for subsite in self.subsites:
            zplsc_24_subsite_prefix = subsite + '-'

            try:
                deployments = self.get_deployment_dirs(subsite)
            except OSError:
                continue

            for deployment in deployments:
                zplsc_24_deployment_prefix = zplsc_24_subsite_prefix + 'R' + str(deployment) + '-'

                try:
                    echogram_dates, date_dirs_path = self.get_date_dirs(subsite, deployment)
                except OSError:
                    continue

                for date_dir, entire_month in echogram_dates.items():
                    # get_date_dirs has set self.serial_num for this deployment.
                    self.zplsc_24_datafile_prefix = zplsc_24_deployment_prefix + 'sn' + self.serial_num + '-'

                    # Expand an "entire month" entry to each day of that month so that
                    # both cases share the processing loop below.
                    if entire_month:
                        number_of_days_in_the_month = calendar.monthrange(date_dir.year, date_dir.month)[1]
                        dates_to_process = [date_dir + timedelta(days=day)
                                            for day in range(number_of_days_in_the_month)]
                    else:
                        dates_to_process = [date_dir]

                    for echogram_date in dates_to_process:
                        # Aggregate the 24 raw data files for the given instrument to 1 24-hour data file.
                        zplsc_24_datafile, zplsc_echogram_file_path = self.aggregate_raw_data(
                            date_dirs_path, echogram_date)
                        if not zplsc_24_datafile:
                            log.warning('Unable to aggregate raw data files for %s under %s',
                                        echogram_date, date_dirs_path)
                            continue

                        # Send the 24-hour raw data file to the zplsc C Series parser to generate the echogram.
                        with open(zplsc_24_datafile) as file_handle:
                            parser = ZplscCParser(CONFIG, file_handle, self.rec_exception_callback)
                            parser.create_echogram(zplsc_echogram_file_path)

                        if not self.keep_temp_files:
                            self.purge_temporary_files()

        # Remove the temporary data file directory and its content.
        if not self.keep_temp_files:
            shutil.rmtree(self.temp_directory)

        # If it's running as a daily process, wait 24 hours and re-run this method
        if self.process_mode:
            threading.Timer(SECONDS_IN_DAY, self.generate_zplsc_echograms).start()
def get_date_dirs(self, subsite, deployment):
    """
    Locate the directory holding the date directories (format YYYYMM) of the
    given subsite and deployment, and determine the echogram dates to process.

    As a side effect, self.serial_num is set to the serial number captured
    from the matching instrument directory name.

    If self.echogram_dates is non-empty it is returned unchanged. Otherwise
    the date directories are listed: in process mode only the date returned by
    get_latest_echogram_date is used (flag False); otherwise the first day of
    every year/month directory is used (flag True, i.e. the entire month).

    Exceptions raised by this method:
        OSError     (a directory is missing, or no instrument/recovered
                     directory was found)
        ValueError  (a date directory name cannot be converted to year/month)

    :param subsite: The subsite of the ZPLSC instrument.
    :param deployment: The deployment number of the data of interest.
    :return: echogram_dates: The mapping of echogram dates to the entire month flag
             date_dirs_path: The path to the date directories.
    """
    # Generate the portion of the path up to the DCL directory to get the all the instrument sub-directories.
    deployment_dir = os.path.join(self.raw_data_dir, subsite.upper(), 'R%05d' % deployment)
    dcl_path = ''
    instrument_dirs = []
    for dcl_rel_path in DCL_PATHS:
        dcl_path = os.path.join(deployment_dir, dcl_rel_path)
        try:
            instrument_dirs = self.get_dir_contents(dcl_path, True)
            break
        except OSError:
            log.info('Could not find path: %s: checking alternate path', dcl_path)
            # Give up once the last candidate has failed. The relative path has to be
            # compared here: the joined dcl_path is never an entry of DCL_PATHS.
            if dcl_rel_path == DCL_PATHS[-1]:
                raise

    # Generate the portion of the path up to the ZPLSC Instrument serial number.
    serial_num_found = None
    for instrument in instrument_dirs:
        serial_num_found = SERIAL_NUM_DIR_MATCHER.match(instrument)
        if serial_num_found:
            break

    if serial_num_found is None:
        log.warning('Could not find ZPLSC data for subsite: %s and recovered deployment: %s', subsite, deployment)
        raise OSError

    self.serial_num = serial_num_found.group(1)
    serial_num_dir = os.path.join(dcl_path, serial_num_found.group())
    sub_dirs = self.get_dir_contents(serial_num_dir)

    # Generate the portion of the path that contains the recovered data path.
    recovered_path = RECOVERED_DIR % (subsite.lower(), self.serial_num)
    recovered_dir = next((sub_dir for sub_dir in sub_dirs if sub_dir.startswith(recovered_path)), '')
    if not recovered_dir:
        log.warning('Could not find ZPLSC recovered data path starting with: %s', recovered_path)
        raise OSError

    # Create the raw data path including the recovered path
    date_dirs_path = os.path.join(serial_num_dir, recovered_dir, DATA_PATH)

    # If no dates were entered on the command line, get the entire list of date directories.
    echogram_dates = self.echogram_dates
    if not echogram_dates:
        echogram_dates = {}

        # Get all the year/month date subdirectories for this subsite/deployment,
        # newest first, split into (year, month) string pairs.
        date_dirs = self.get_dir_contents(date_dirs_path, True)
        date_dirs = [(date_dir[:4], date_dir[4:]) for date_dir in date_dirs]

        # If in process mode, get the latest date that has 24 1-hour data files for echogram generation.
        if self.process_mode:
            echogram_dates[self.get_latest_echogram_date(date_dirs_path, date_dirs)] = False

        # Otherwise, get all the year/month date subdirectories for this subsite and deployment.
        else:
            for year, month in date_dirs:
                # Save the date and indicate that the entire month should be generated.
                echogram_dates[date(int(year), int(month), 1)] = True

    return echogram_dates, date_dirs_path
def generate_zplsc_echograms(self):
    """
    Generate ZPLSC echograms, either for one 1-hour raw data file or for
    24-hour periods of every requested subsite, deployment and date.

    If self.zplsc_datafile is set, that single file is handed to the ZPLSC
    C Series parser. The echogram directory is built under
    self.base_echogram_directory from the part of the raw data path that
    starts at the first matching entry of self.zplsc_subsites.

    Otherwise, for each subsite, deployment and echogram date, the raw data
    is aggregated into one 24-hour data file (see aggregate_raw_data) and
    that file is handed to the parser. Dates flagged as "entire month" are
    expanded to every day of that month. Subsites or deployments whose
    directories cannot be read (OSError) are skipped.

    NOTE(review): the temporary directory removal and the daily re-scheduling
    are performed in the 24-hour case only, since self.temp_directory is only
    assigned there - confirm this matches the intended layout.

    :return:
    """
    # If we are creating a 1-hour echogram, generate the echogram.
    if self.zplsc_datafile is not None:
        # Send the 1-hour raw data file to the zplsc C Series parser to generate the echogram.
        with open(self.zplsc_datafile) as file_handle:
            base_directory = os.path.expanduser(
                self.base_echogram_directory)
            path_structure = os.path.dirname(self.zplsc_datafile)

            zplsc_echogram_file_path = None
            for subsite in self.zplsc_subsites:
                subsite_index = path_structure.find(subsite)
                if subsite_index < 0:
                    continue

                zplsc_echogram_file_path = os.path.join(
                    base_directory, path_structure[subsite_index:])

                # Create the ZPLSC Echogram directory structure if it doesn't exist.
                try:
                    os.makedirs(zplsc_echogram_file_path)
                except OSError as ex:
                    already_exists = (
                        ex.errno == errno.EEXIST
                        and os.path.isdir(zplsc_echogram_file_path))
                    if not already_exists:
                        # Log the exception itself: ex.message is deprecated and is
                        # empty for errno-style OSErrors, which hid the failure reason.
                        log.error(
                            'Error creating local ZPLSC Echogram storage directory: %s',
                            ex)
                        raise
                break

            if zplsc_echogram_file_path is not None:
                # Get the parser for this file and generate the echogram.
                parser = ZplscCParser(CONFIG, file_handle,
                                      self.rec_exception_callback)
                parser.create_echogram(zplsc_echogram_file_path)
            else:
                log.warning(
                    'The subsite is not one of the subsites containing a ZPLSC-C instrument.'
                )

    else:  # We are creating 24-hour echograms ...
        # Create the temporary data file directory.
        self.temp_directory = os.path.join(os.path.expanduser(USER_HOME),
                                           TEMP_DIR)
        if not os.path.exists(self.temp_directory):
            os.mkdir(self.temp_directory)

        # Create the echograms for the zplsc instruments of each subsite.
        for subsite in self.subsites:
            zplsc_24_subsite_prefix = subsite + '-'

            try:
                deployments = self.get_deployment_dirs(subsite)
            except OSError:
                continue

            for deployment in deployments:
                zplsc_24_deployment_prefix = zplsc_24_subsite_prefix + 'R' + str(
                    deployment) + '-'

                try:
                    echogram_dates, date_dirs_path = self.get_date_dirs(
                        subsite, deployment)
                except OSError:
                    continue

                for date_dir, entire_month in echogram_dates.items():
                    # get_date_dirs has set self.serial_num for this deployment.
                    self.zplsc_24_datafile_prefix = zplsc_24_deployment_prefix + 'sn' + self.serial_num + '-'

                    # Expand an "entire month" entry to each day of that month so that
                    # both cases share the processing loop below.
                    if entire_month:
                        number_of_days_in_the_month = calendar.monthrange(
                            date_dir.year, date_dir.month)[1]
                        dates_to_process = [
                            date_dir + timedelta(days=day)
                            for day in range(number_of_days_in_the_month)
                        ]
                    else:
                        dates_to_process = [date_dir]

                    for echogram_date in dates_to_process:
                        # Aggregate the 24 raw data files for the given instrument to 1 24-hour data file.
                        zplsc_24_datafile, zplsc_echogram_file_path = self.aggregate_raw_data(
                            date_dirs_path, echogram_date)
                        if not zplsc_24_datafile:
                            log.warning(
                                'Unable to aggregate raw data files for %s under %s',
                                echogram_date, date_dirs_path)
                            continue

                        # Send the 24-hour raw data file to the zplsc C Series parser to generate the echogram.
                        with open(zplsc_24_datafile) as file_handle:
                            parser = ZplscCParser(
                                CONFIG, file_handle,
                                self.rec_exception_callback)
                            parser.create_echogram(zplsc_echogram_file_path)

                        if not self.keep_temp_files:
                            self.purge_temporary_files()

        # Remove the temporary data file directory and its content.
        if not self.keep_temp_files:
            shutil.rmtree(self.temp_directory)

        # If it's running as a daily process, wait 24 hours and re-run this method
        if self.process_mode:
            threading.Timer(SECONDS_IN_DAY,
                            self.generate_zplsc_echograms).start()
def get_date_dirs(self, subsite, deployment):
    """
    Locate the directory holding the date directories (format YYYYMM) of the
    given subsite and deployment, and determine the echogram dates to process.

    As a side effect, self.serial_num is set to the serial number captured
    from the matching instrument directory name.

    If self.echogram_dates is non-empty it is returned unchanged. Otherwise
    the date directories are listed: in process mode only the date returned by
    get_latest_echogram_date is used (flag False); otherwise the first day of
    every year/month directory is used (flag True, i.e. the entire month).

    Exceptions raised by this method:
        OSError     (a directory is missing, or no instrument/recovered
                     directory was found)
        ValueError  (a date directory name cannot be converted to year/month)

    :param subsite: The subsite of the ZPLSC instrument.
    :param deployment: The deployment number of the data of interest.
    :return: echogram_dates: The mapping of echogram dates to the entire month flag
             date_dirs_path: The path to the date directories.
    """
    # Generate the portion of the path up to the DCL directory to get the all the instrument sub-directories.
    deployment_dir = os.path.join(self.raw_data_dir, subsite.upper(),
                                  'R%05d' % deployment)
    dcl_path = ''
    instrument_dirs = []
    for dcl_rel_path in DCL_PATHS:
        dcl_path = os.path.join(deployment_dir, dcl_rel_path)
        try:
            instrument_dirs = self.get_dir_contents(dcl_path, True)
            break
        except OSError:
            log.info('Could not find path: %s: checking alternate path',
                     dcl_path)
            # Give up once the last candidate has failed. The relative path has to be
            # compared here: the joined dcl_path is never an entry of DCL_PATHS.
            if dcl_rel_path == DCL_PATHS[-1]:
                raise

    # Generate the portion of the path up to the ZPLSC Instrument serial number.
    serial_num_found = None
    for instrument in instrument_dirs:
        serial_num_found = SERIAL_NUM_DIR_MATCHER.match(instrument)
        if serial_num_found:
            break

    if serial_num_found is None:
        log.warning(
            'Could not find ZPLSC data for subsite: %s and recovered deployment: %s',
            subsite, deployment)
        raise OSError

    self.serial_num = serial_num_found.group(1)
    serial_num_dir = os.path.join(dcl_path, serial_num_found.group())
    sub_dirs = self.get_dir_contents(serial_num_dir)

    # Generate the portion of the path that contains the recovered data path.
    recovered_path = RECOVERED_DIR % (subsite.lower(), self.serial_num)
    recovered_dir = next(
        (sub_dir for sub_dir in sub_dirs
         if sub_dir.startswith(recovered_path)), '')
    if not recovered_dir:
        log.warning(
            'Could not find ZPLSC recovered data path starting with: %s',
            recovered_path)
        raise OSError

    # Create the raw data path including the recovered path
    date_dirs_path = os.path.join(serial_num_dir, recovered_dir, DATA_PATH)

    # If no dates were entered on the command line, get the entire list of date directories.
    echogram_dates = self.echogram_dates
    if not echogram_dates:
        echogram_dates = {}

        # Get all the year/month date subdirectories for this subsite/deployment,
        # newest first, split into (year, month) string pairs.
        date_dirs = self.get_dir_contents(date_dirs_path, True)
        date_dirs = [(date_dir[:4], date_dir[4:]) for date_dir in date_dirs]

        # If in process mode, get the latest date that has 24 1-hour data files for echogram generation.
        if self.process_mode:
            echogram_dates[self.get_latest_echogram_date(
                date_dirs_path, date_dirs)] = False

        # Otherwise, get all the year/month date subdirectories for this subsite and deployment.
        else:
            for year, month in date_dirs:
                # Save the date and indicate that the entire month should be generated.
                echogram_dates[date(int(year), int(month), 1)] = True

    return echogram_dates, date_dirs_path
def generate_zplsc_echograms(self):
    """
    Generate the 24-hour ZPLSC echograms for every requested subsite,
    deployment and date.

    For each subsite, deployment and echogram date, the raw data is
    aggregated into one 24-hour data file (see aggregate_raw_data) and that
    file is handed to the ZPLSC C Series parser to create the echogram. Dates
    flagged as "entire month" are expanded to every day of that month.

    A subsite whose deployments cannot be read (urllib2.HTTPError) is skipped.
    A deployment is skipped when get_date_dirs raises urllib2.HTTPError or
    ValueError (the latter is what it raises when no ZPLSC data directory is
    found), so one deployment without ZPLSC data no longer aborts the run.

    When self.process_mode is set, this method re-schedules itself to run
    again after SECONDS_IN_DAY seconds.

    :return:
    """
    # Create the temporary data file directory.
    self.temp_directory = os.path.join(os.path.expanduser(USER_HOME), TEMP_DIR)
    if not os.path.exists(self.temp_directory):
        os.mkdir(self.temp_directory)

    # Create the echograms for the zplsc instruments of each subsite.
    for subsite in self.subsites:
        zplsc_24_subsite_prefix = subsite + '-'

        try:
            deployments = self.get_deployment_dirs(subsite)
        except urllib2.HTTPError:
            continue

        for deployment in deployments:
            zplsc_24_deployment_prefix = zplsc_24_subsite_prefix + 'R' + str(deployment) + '-'

            try:
                echogram_dates, date_dirs_url = self.get_date_dirs(subsite, deployment)
            except (urllib2.HTTPError, ValueError):
                # get_date_dirs has already logged the reason; move on to the next deployment.
                continue

            for date_dir, entire_month in echogram_dates.items():
                # get_date_dirs has set self.serial_num for this deployment.
                self.zplsc_24_datafile_prefix = zplsc_24_deployment_prefix + 'sn' + self.serial_num + '-'

                # Expand an "entire month" entry to each day of that month so that
                # both cases share the processing loop below.
                if entire_month:
                    number_of_days_in_the_month = calendar.monthrange(date_dir.year, date_dir.month)[1]
                    dates_to_process = [date_dir + timedelta(days=day)
                                        for day in range(number_of_days_in_the_month)]
                else:
                    dates_to_process = [date_dir]

                for echogram_date in dates_to_process:
                    # Aggregate the 24 raw data files for the given instrument to 1 24-hour data file.
                    zplsc_24_datafile, zplsc_echogram_file_path = self.aggregate_raw_data(
                        date_dirs_url, echogram_date)
                    if not zplsc_24_datafile:
                        log.warning('Unable to aggregate raw data files for: %s', echogram_date)
                        continue

                    # Send the 24-hour raw data file to the zplsc C Series parser to generate the echogram.
                    with open(zplsc_24_datafile) as file_handle:
                        parser = ZplscCParser(CONFIG, file_handle, self.rec_exception_callback)
                        parser.create_echogram(zplsc_echogram_file_path)

                    if not self.keep_temp_files:
                        self.purge_temporary_files()

    # Remove the temporary data file directory and its content.
    if not self.keep_temp_files:
        shutil.rmtree(self.temp_directory)

    # If it's running as a daily process, wait 24 hours and re-run this method
    if self.process_mode:
        threading.Timer(SECONDS_IN_DAY, self.generate_zplsc_echograms).start()
def get_date_dirs(self, subsite, deployment):
    """
    Build the URL of the directory holding the date directories (format
    YYYYMM) of the given subsite and deployment, and determine the echogram
    dates to process.

    As a side effect, self.serial_num is set to the serial number captured
    (group 2) from the instrument directory listing.

    If self.echogram_dates is non-empty it is returned unchanged. Otherwise
    the date directories are read from the URL: in process mode only the date
    returned by get_latest_echogram_date is used (flag False); otherwise the
    first day of every year/month directory is used (flag True, i.e. the
    entire month).

    :param subsite: The subsite of the ZPLSC instrument.
    :param deployment: The deployment number of the data of interest.
    :return: echogram_dates: The mapping of echogram dates to the entire month flag
             date_dirs_url: The path to the date directories.
    :raises urllib2.HTTPError: If none of the DCL paths, or a later directory, can be read.
    :raises ValueError: If no ZPLSC instrument directory is found in the listing.
    """
    # Generate the portion of the URL up to the DCL directory to get the all the instrument sub-directories.
    deployment_url = os.path.join(RAW_DATA_URL, subsite.upper(), 'R%05d' % deployment)
    dcl_url = ''
    instrument_dirs = ''
    for dcl_path in DCL_PATHS:
        dcl_url = os.path.join(deployment_url, dcl_path)
        try:
            instrument_dirs = self.get_dir_contents(dcl_url)
            break
        except urllib2.HTTPError:
            log.info('Could not find path: %s: checking alternate path', dcl_path)
            # Give up once the last candidate has failed (compare by value, not identity).
            if dcl_path == DCL_PATHS[-1]:
                raise

    # Generate the portion of the URL up to the ZPLSC Instrument serial number.
    serial_num_found = SERIAL_NUM_DIR_MATCHER.search(instrument_dirs)
    if serial_num_found is None:
        log.warning('Could not find ZPLSC data for subsite: %s and deployment: %s', subsite, deployment)
        raise ValueError

    self.serial_num = serial_num_found.group(2)
    # Reuse the match found above instead of searching the listing a second time.
    serial_num_url = os.path.join(dcl_url, serial_num_found.group(1))
    sub_dirs = self.get_dir_contents(serial_num_url)

    # Generate the portion of the URL that contains the recovered data path.
    recovered_path = RECOVERED_DIR % (subsite.lower(), self.serial_num)
    start_idx = sub_dirs.find(recovered_path)

    # If this is the directory structure that has the recovered directory, add it to the URL.
    date_dirs_url = serial_num_url
    if start_idx != -1:
        # The recovered directory name is the prefix followed by a date of
        # len(RECOVERED_DATE_FMT) characters.
        end_idx = start_idx + len(recovered_path) + len(RECOVERED_DATE_FMT)
        recovered_path = os.path.join(sub_dirs[start_idx:end_idx], DATA_PATH)

        # Create the raw data URL with the recovered path
        date_dirs_url = os.path.join(serial_num_url, recovered_path)

    # If no dates were entered on the command line, get the entire list of date directories.
    echogram_dates = self.echogram_dates
    if not echogram_dates:
        # Get all the year/month date subdirectories for this subsite and deployment.
        date_dirs_response = self.get_dir_contents(date_dirs_url)

        # Generate the list of the date directories, newest first.
        echogram_dates = {}
        date_dirs_list = DATE_DIR_RE_MATCHER.findall(date_dirs_response)
        date_dirs_list = sorted(date_dirs_list, key=lambda x: (x[0], x[1]), reverse=True)

        # If in process mode, get the latest date that has 24 1-hour data files for echogram generation.
        if self.process_mode:
            echogram_dates[self.get_latest_echogram_date(date_dirs_url, date_dirs_list)] = False

        # Otherwise, get all the year/month date subdirectories for this subsite and deployment.
        else:
            for date_dir in date_dirs_list:
                year = int(date_dir[0])
                month = int(date_dir[1])

                # Save the date and indicate that the entire month should be generated.
                echogram_dates[date(year, month, 1)] = True

    return echogram_dates, date_dirs_url