def archive_data(self, filepath, done=True):
    if done:
        archive_location = self.archive_location
        archive_label = 'Archive'
    else:
        archive_location = self.fail_location
        archive_label = 'Fail'
    if archive_location is None:
        logger.error('{} Location "{}" is not specified'.format(
            archive_label, archive_location))
        raise LocationNotExist('{} Location "{}" is not specified'.format(
            archive_label, archive_location))
    source_path = os.path.join(self.source_location, filepath)
    archive_path = os.path.join(archive_location, filepath)
    archive_base = os.path.dirname(archive_path)
    if not os.path.exists(archive_base):
        logger.info('{} directory {} does not exist, creating it.'.format(
            archive_label, archive_base))
        os.makedirs(archive_base)
    logger.info('Archiving file to {}'.format(archive_path))
    if os.path.exists(source_path):
        os.rename(source_path, archive_path)
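# Illustrative only: with hypothetical locations, archive_data('sub/data.csv')
# above would move
#     <source_location>/sub/data.csv  ->  <archive_location>/sub/data.csv
# creating <archive_location>/sub first if it does not exist (or the
# fail_location equivalents when done=False).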
def __init__(self, config):
    super().__init__('mongo', config)
    ds_config = config[self.ds_config_section]
    self.mongo_server = ds_config["MongoServer"]
    # The port must be an integer, so convert it to int
    self.mongo_port = int(ds_config["MongoPort"])
    self.mongo_database = ds_config["MongoDatabase"]
    self.mongo_collection = ds_config["MongoCollection"]
    if not self.mongo_database or not self.mongo_collection:
        logger.error("Mongo database and/or collection not defined in configuration")
        raise Exception("Mongo database and/or collection not defined in configuration")
    self.mongo_query = literal_eval(ds_config["MongoQuery"])
    logger.debug("Mongo Query: %s", self.mongo_query)
    # # We don't want to default to loading all data from a collection,
    # # so let's require a query for now (the user can still specify a
    # # match-all query if they want to)
    # if not self.mongo_query:
    #     logger.error("Failed to load MongoQuery from configuration")
    #     raise Exception("Failed to load MongoQuery from configuration")
    self.mongo_username = ds_config["MongoUsername"]
    self.mongo_password = ds_config["MongoPassword"]
    if not self.mongo_username or not self.mongo_password:
        logger.warning("Username and/or password not set, not using authentication")
    self.mongo_client = MongoClient(self.mongo_server, self.mongo_port)
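# Illustrative only: a minimal sketch of the configuration keys read above,
# assuming `config` behaves like a nested mapping and that the base class
# resolves `ds_config_section` to a section such as "DsMongo" (the actual
# section name is set elsewhere; the name and all values here are hypothetical).
#
# config = {
#     "DsMongo": {
#         "MongoServer": "localhost",
#         "MongoPort": "27017",                # parsed with int()
#         "MongoDatabase": "mydb",
#         "MongoCollection": "mycollection",
#         "MongoQuery": "{'status': 'NEW'}",   # parsed with literal_eval()
#         "MongoUsername": "",                 # empty -> no authentication
#         "MongoPassword": "",
#     },
# }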
def read_data_to_df(self, datafile, dtype=None):
    try:
        logger.info('Reading file {}'.format(datafile))
        dfile = os.path.join(self.source_location, datafile)
        df = self.file_parser(dfile, dtype=dtype)
        df = DsFileBase.add_filename_to_df(df, datafile)
        return df
    except Exception as e:
        logger.error('Failed to read file "{}" due to error {}'.format(
            datafile, e))
        raise SourceDataError('Failed to read file "{}"'.format(datafile))
def read_data_to_df(self, datafile, dtype=None):
    try:
        logger.info('Reading file {}'.format(datafile))
        dfile = os.path.join(self.source_location, datafile)
        df = pd.read_csv(dfile, dtype=dtype, sep=self.column_separator,
                         decimal=self.decimal_point, quotechar="'",
                         compression=self.compression)
        df = DsFileBase.add_filename_to_df(df, datafile)
        return df
    except Exception as e:
        logger.error('Failed to read file "{}" due to error {}'.format(
            datafile, e))
        raise SourceDataError('Failed to read file "{}"'.format(datafile))
def read_data_to_df(self, datafile, dtype=None):
    try:
        logger.info('Reading file {}'.format(datafile))
        dfile = os.path.join(self.source_location, datafile)
        dfs = pd.read_excel(dfile, dtype=dtype, sheet_name=self.sheets)
        if isinstance(dfs, dict):
            # Multiple sheets: read_excel returns a mapping of sheet name to
            # DataFrame (an OrderedDict on older pandas, a plain dict on
            # newer ones). Tag each frame with the filename and sheet name.
            mdfs = {}
            for name, sheet_df in dfs.items():
                sheet_df = DsFileBase.add_filename_to_df(sheet_df, datafile)
                sheet_df['sheetname'] = name
                mdfs[name] = sheet_df
        else:
            mdfs = DsFileBase.add_filename_to_df(dfs, datafile)
        return mdfs
    except Exception as e:
        logger.error('Failed to read file "{}" due to error {}'.format(
            datafile, e))
        raise SourceDataError('Failed to read file "{}"'.format(datafile))
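# Note on the return value above: when self.sheets selects several sheets the
# method returns a dict of {sheet_name: DataFrame} with a 'sheetname' column
# added to each frame; for a single sheet it returns one DataFrame. In both
# cases the frames are passed through DsFileBase.add_filename_to_df first.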
def read_data_to_df(self, tablename, dtype=None):
    logger.info('Reading table {}'.format(tablename))
    if self.select_columns:
        columns = self.select_columns[:]
        if self.watermark_column and self.watermark_column not in columns:
            columns.append(self.watermark_column)
        query = 'select {c} from {t}'.format(c=','.join(columns), t=tablename)
    else:
        query = 'select * from {t}'.format(t=tablename)
    if self.watermark_column:
        wm = self.watermarks.get(tablename, None)
        if wm is not None:
            if pd.api.types.is_number(wm):
                sql = "{q} where {c} > {w} order by {c} asc".format(
                    q=query, c=self.watermark_column, w=wm)
            else:
                sql = "{q} where {c} > '{w}' order by {c} asc".format(
                    q=query, c=self.watermark_column, w=wm)
        else:
            sql = '{q} order by {c} asc'.format(q=query,
                                                c=self.watermark_column)
    else:
        sql = query
    try:
        df = self.run_query(sql)
        if len(df.index) > 0:
            if self.watermark_column:
                self.watermarks[tablename] = df.iloc[-1][self.watermark_column]
    except Exception as e:
        logger.error('Failed to read table "{}" due to error {}'.format(
            tablename, e))
        raise SourceDataError(
            'Failed to read table "{}"'.format(tablename))
    return df
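# Illustrative only: the SQL generated above for a hypothetical table 'events'
# with select_columns=['id', 'name'] and watermark_column='updated_at':
#
#   first run (no stored watermark):
#     select id,name,updated_at from events order by updated_at asc
#   subsequent run (string watermark '2020-01-01'):
#     select id,name,updated_at from events
#         where updated_at > '2020-01-01' order by updated_at asc
#
# Note that the watermark value is interpolated directly into the SQL text,
# so it is assumed to come from trusted configuration and previously read
# data rather than from user input.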
def read_data_to_df(self, src_md_id, dtype=None):
    logger.info('Reading metadata {}'.format(src_md_id))
    if src_md_id not in self.source_metadata:
        logger.error('Metadata not found for id "{}"'.format(src_md_id))
        raise SourceDataError(
            'Metadata not found for id "{}"'.format(src_md_id))
    src_metadata = self.source_metadata[src_md_id]
    if src_metadata['status'] != 'READY':
        return
    # Normalise the type first, in case of mistype, e.g. mixed cases.
    ds_config_section = 'Ds' + src_metadata['type'].lower().title().replace('.', '')
    if self.datasources.get(ds_config_section, None) is None:
        ds_cls = getattr(sys.modules[__name__], ds_config_section, None)
        if ds_cls is None:
            logger.error('Unrecognised datasource type "{}".'.format(
                src_metadata['type']))
            raise SourceDataError(
                'Unrecognised datasource type "{}".'.format(
                    src_metadata['type']))
        self.datasources[ds_config_section] = ds_cls(self.config)
    source_name = src_metadata.get('source_name', None)
    if source_name is None:
        logger.error(
            'Metadata "{}" does not contain a source_name'.format(
                src_metadata['id']))
        raise SourceDataError(
            'Metadata "{}" does not contain a source_name'.format(
                src_metadata['id']))
    self.update_metadata_status(src_md_id, 'PROCESSING')
    df = self.datasources[ds_config_section].read_data_to_df(source_name,
                                                             dtype=dtype)
    if self.metadata_proc is not None:
        try:
            df = self.metadata_proc(df, src_metadata)
        except Exception as e:
            logger.error(
                'Failed to execute metadata processor "{}" due to error {}'
                .format(self.metadata_proc_name, e))
            raise ExectionError(
                'Failed to execute metadata processor "{}"'.format(
                    self.metadata_proc_name))
    return df
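# Illustrative only: a sketch of the metadata record fields the dispatcher
# above relies on; the values are hypothetical and any additional fields
# belong to the real metadata store and are not shown.
#
# src_metadata = {
#     'id': 'orders-2020-01',    # echoed in error messages
#     'status': 'READY',         # anything else causes the record to be skipped
#     'type': 'mongo',           # normalised to a class name, e.g. 'DsMongo'
#     'source_name': 'orders',   # passed to that datasource's read_data_to_df()
# }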