Ejemplo n.º 1
0
    def archive_data(self, filepath, done=True):
        """Move a source file into the archive location or the fail location.

        Args:
            filepath: Path of the file relative to ``self.source_location``.
                The same relative path is reused under the target location.
            done: When true the file is moved under
                ``self.archive_location``; otherwise it is moved under
                ``self.fail_location``.

        Raises:
            LocationNotExist: If the selected target location is ``None``.
        """
        if done:
            archive_location = self.archive_location
            archive_label = 'Archive'
        else:
            archive_location = self.fail_location
            archive_label = 'Fail'

        if archive_location is None:
            message = '{} Location "{}" is not specified'.format(
                archive_label, archive_location)
            logger.error(message)
            raise LocationNotExist(message)

        source_path = os.path.join(self.source_location, filepath)
        archive_path = os.path.join(archive_location, filepath)
        archive_base = os.path.dirname(archive_path)

        if not os.path.exists(archive_base):
            logger.info('{} directory {} does not exist, creating it.'.format(
                archive_label, archive_base))
            # exist_ok avoids a failure if the directory gets created by
            # someone else between the existence check and this call.
            os.makedirs(archive_base, exist_ok=True)

        if not os.path.exists(source_path):
            # Nothing to move; say so instead of logging a misleading
            # "Archiving file" message for a file that is not there.
            logger.warning(
                'Source file {} does not exist, nothing to archive'.format(
                    source_path))
            return

        logger.info('Archiving file to {}'.format(archive_path))
        os.rename(source_path, archive_path)
Ejemplo n.º 2
0
    def __init__(self, config):
        """Initialise the Mongo datasource from its configuration section.

        Reads ``MongoServer``, ``MongoPort``, ``MongoDatabase``,
        ``MongoCollection``, ``MongoQuery``, ``MongoUsername`` and
        ``MongoPassword`` from ``config[self.ds_config_section]`` and creates
        a ``MongoClient`` for the configured server and port.

        Args:
            config: Mapping of configuration sections; the section named by
                ``self.ds_config_section`` must be present (presumably that
                attribute is set by the base-class initialiser -- confirm).

        Raises:
            Exception: If the database or collection name is empty.
        """
        super().__init__('mongo', config)

        ds_config = config[self.ds_config_section]

        self.mongo_server = ds_config["MongoServer"]
        # The port must be an integer, so convert it to int
        self.mongo_port = int(ds_config["MongoPort"])

        self.mongo_database = ds_config["MongoDatabase"]
        self.mongo_collection = ds_config["MongoCollection"]
        if not self.mongo_database or not self.mongo_collection:
            logger.error("Mongo database and/or collection not defined in configuration")
            raise Exception("Mongo database and/or collection not defined in configuration")

        # The query is stored in the configuration as a Python literal.
        self.mongo_query = literal_eval(ds_config["MongoQuery"])
        logger.debug("Mongo Query: %s", self.mongo_query)

        # # We don't want to default loading all data from a collection,
        # # so let's require a query for now (user can still specify a query all
        # # if they want to)
        # if not self.mongo_query:
        #     logger.error("Failed to load MongoQuery from configuration")
        #     raise Exception("Failed to load MongoQuery from configuration")

        self.mongo_username = ds_config["MongoUsername"]
        self.mongo_password = ds_config["MongoPassword"]
        if not self.mongo_username or not self.mongo_password:
            # Logger.warn is a deprecated alias; warning() is the supported
            # spelling.
            logger.warning("Username and/or password not set, not using authentication")

        # NOTE(review): the credentials are only stored here; they are not
        # passed to MongoClient in this method -- confirm authentication is
        # applied elsewhere.
        self.mongo_client = MongoClient(self.mongo_server, self.mongo_port)
Ejemplo n.º 3
0
 def read_data_to_df(self, datafile, dtype=None):
     """Parse one source file with ``self.file_parser``.

     Args:
         datafile: File name relative to ``self.source_location``.
         dtype: Forwarded to ``self.file_parser`` as the ``dtype`` keyword.

     Returns:
         The parsed data after ``DsFileBase.add_filename_to_df`` has been
         applied to it (presumably a pandas DataFrame -- confirm against
         the parser in use).

     Raises:
         SourceDataError: If any step of reading the file fails; the
             original exception is attached as the cause.
     """
     try:
         logger.info('Reading file {}'.format(datafile))
         dfile = os.path.join(self.source_location, datafile)
         df = self.file_parser(dfile, dtype=dtype)
         df = DsFileBase.add_filename_to_df(df, datafile)
         return df
     except Exception as e:
         logger.error('Failed to read file "{}" due to error {}'.format(
             datafile, e))
         # Chain explicitly so the root cause travels with the domain error.
         raise SourceDataError(
             'Failed to read file "{}"'.format(datafile)) from e
Ejemplo n.º 4
0
 def read_data_to_df(self, datafile, dtype=None):
     """Read one delimited text file into a pandas DataFrame.

     The file is parsed with ``pd.read_csv`` using the separator, decimal
     point and compression configured on this instance, and a single quote
     as the quote character.

     Args:
         datafile: File name relative to ``self.source_location``.
         dtype: Forwarded to ``pd.read_csv`` as ``dtype``.

     Returns:
         The DataFrame after ``DsFileBase.add_filename_to_df`` has been
         applied to it.

     Raises:
         SourceDataError: If any step of reading the file fails; the
             original exception is attached as the cause.
     """
     try:
         logger.info('Reading file {}'.format(datafile))
         dfile = os.path.join(self.source_location, datafile)
         df = pd.read_csv(dfile,
                          dtype=dtype,
                          sep=self.column_separator,
                          decimal=self.decimal_point,
                          quotechar="'",
                          compression=self.compression)
         df = DsFileBase.add_filename_to_df(df, datafile)
         return df
     except Exception as e:
         logger.error('Failed to read file "{}" due to error {}'.format(
             datafile, e))
         # Chain explicitly so the root cause travels with the domain error.
         raise SourceDataError(
             'Failed to read file "{}"'.format(datafile)) from e
Ejemplo n.º 5
0
 def read_data_to_df(self, datafile, dtype=None):
     """Read one Excel file, tagging every sheet with its origin.

     The sheets selected by ``self.sheets`` are read with ``pd.read_excel``.

     Args:
         datafile: File name relative to ``self.source_location``.
         dtype: Forwarded to ``pd.read_excel`` as ``dtype``.

     Returns:
         When ``pd.read_excel`` yields a mapping of sheets: a dict keyed by
         sheet name whose frames have been passed through
         ``DsFileBase.add_filename_to_df`` and given a ``sheetname``
         column. Otherwise: the single frame after
         ``DsFileBase.add_filename_to_df``.

     Raises:
         SourceDataError: If any step of reading the file fails; the
             original exception is attached as the cause.
     """
     try:
         logger.info('Reading file {}'.format(datafile))
         dfile = os.path.join(self.source_location, datafile)
         dfs = pd.read_excel(dfile, dtype=dtype, sheet_name=self.sheets)
         # pandas documents the multi-sheet result as a dict of DataFrames;
         # OrderedDict is a dict subclass, so older results match as well.
         if isinstance(dfs, dict):
             mdfs = {}
             for name, sheet_df in dfs.items():
                 sheet_df = DsFileBase.add_filename_to_df(
                     sheet_df, datafile)
                 sheet_df['sheetname'] = name
                 mdfs[name] = sheet_df
         else:
             mdfs = DsFileBase.add_filename_to_df(dfs, datafile)
         # Return the processed frames, not the raw read_excel result.
         return mdfs
     except Exception as e:
         logger.error('Failed to read file "{}" due to error {}'.format(
             datafile, e))
         # Chain explicitly so the root cause travels with the domain error.
         raise SourceDataError(
             'Failed to read file "{}"'.format(datafile)) from e
Ejemplo n.º 6
0
    def read_data_to_df(self, tablename, dtype=None):
        """Read rows from a table, optionally only those past the watermark.

        The query selects ``self.select_columns`` (plus the watermark column
        if it is not already listed) or ``*`` when no columns are configured.
        When ``self.watermark_column`` is set, rows are ordered ascending by
        that column and, if a watermark is stored for the table, restricted
        to values greater than it. After a non-empty read the stored
        watermark is updated to the value in the last returned row.

        Args:
            tablename: Name of the table to read; also the key into
                ``self.watermarks``.
            dtype: Not used by this method.

        Returns:
            The result of ``self.run_query`` for the generated SQL.

        Raises:
            SourceDataError: If running the query or updating the watermark
                fails; the original exception is attached as the cause.
        """
        logger.info('Reading table {}'.format(tablename))

        if self.select_columns:
            # Copy so that appending the watermark column does not mutate
            # the configured column list.
            columns = self.select_columns[:]
            if self.watermark_column and self.watermark_column not in columns:
                columns.append(self.watermark_column)
            query = 'select {c} from {t}'.format(c=','.join(columns),
                                                 t=tablename)
        else:
            query = 'select * from {t}'.format(t=tablename)

        # NOTE(review): the SQL below is assembled by string interpolation of
        # the table name, column names and watermark value. run_query is only
        # seen taking a SQL string here, so this is flagged rather than
        # parameterised -- confirm these values never come from untrusted
        # input.
        if self.watermark_column:
            wm = self.watermarks.get(tablename, None)

            if wm is not None:
                if pd.api.types.is_number(wm):
                    sql = "{q} where {c} > {w} order by {c} asc".format(
                        q=query, c=self.watermark_column, w=wm)
                else:
                    # Non-numeric watermarks are quoted as SQL literals.
                    sql = "{q} where {c} > '{w}' order by {c} asc".format(
                        q=query, c=self.watermark_column, w=wm)
            else:
                sql = '{q} order by {c} asc'.format(q=query,
                                                    c=self.watermark_column)
        else:
            sql = query

        try:
            df = self.run_query(sql)

            # Rows are ordered ascending by the watermark column, so the
            # last row carries the new high-water mark.
            if self.watermark_column and len(df.index) > 0:
                self.watermarks[tablename] = df.iloc[-1][
                    self.watermark_column]

        except Exception as e:
            logger.error('Failed to read table "{}" due to error {}'.format(
                tablename, e))
            # Chain explicitly so the root cause travels with the domain
            # error.
            raise SourceDataError(
                'Failed to read table "{}"'.format(tablename)) from e

        return df
Ejemplo n.º 7
0
    def read_data_to_df(self, src_md_id, dtype=None):
        """Read the data described by one source-metadata entry.

        Looks up ``src_md_id`` in ``self.source_metadata``, creates (and
        caches in ``self.datasources``) the datasource class from this module
        that matches the entry's ``type``, sets the entry's status to
        ``PROCESSING`` and delegates to that datasource's
        ``read_data_to_df``. If ``self.metadata_proc`` is set, it is called
        with the data and the metadata entry and its result is returned
        instead.

        Args:
            src_md_id: Key into ``self.source_metadata``.
            dtype: Forwarded unchanged to the datasource's
                ``read_data_to_df``.

        Returns:
            Whatever the datasource (or the metadata processor) returns, or
            ``None`` when the entry's status is not ``'READY'``.

        Raises:
            SourceDataError: If the id is unknown, the datasource type is not
                recognised, or the entry has no ``source_name``.
            ExectionError: If the metadata processor raises; the original
                exception is attached as the cause.
        """
        logger.info('Reading metadata {}'.format(src_md_id))
        if src_md_id not in self.source_metadata:
            logger.error('Metadata not found for id "{}"'.format(src_md_id))
            raise SourceDataError(
                'Metadata not found for id "{}"'.format(src_md_id))

        src_metadata = self.source_metadata[src_md_id]
        if src_metadata['status'] != 'READY':
            return None

        # Normalise the type first, in case of mistype, e.g. mixed cases.
        # For example 'file.CSV' becomes 'DsFileCsv'.
        ds_config_section = 'Ds' + src_metadata['type'].lower().title(
        ).replace('.', '')

        # Datasource instances are created on first use and then reused.
        if self.datasources.get(ds_config_section, None) is None:
            ds_cls = getattr(sys.modules[__name__], ds_config_section, None)
            if ds_cls is None:
                logger.error('Unrecognised datasource type "{}".'.format(
                    src_metadata['type']))
                raise SourceDataError(
                    'Unrecognised datasource type "{}".'.format(
                        src_metadata['type']))
            self.datasources[ds_config_section] = ds_cls(self.config)

        source_name = src_metadata.get('source_name', None)
        if source_name is None:
            # Fall back to the lookup key so a missing 'id' field cannot
            # mask this error with a KeyError.
            message = 'Metadata "{}" does not contain a source_name'.format(
                src_metadata.get('id', src_md_id))
            logger.error(message)
            raise SourceDataError(message)
        self.update_metadata_status(src_md_id, 'PROCESSING')

        df = self.datasources[ds_config_section].read_data_to_df(source_name,
                                                                 dtype=dtype)

        if self.metadata_proc is not None:
            try:
                df = self.metadata_proc(df, src_metadata)
            except Exception as e:
                logger.error(
                    'Failed to execute metadata processor "{}" due to error {}'
                    .format(self.metadata_proc_name, e))
                # Chain explicitly so the root cause travels with the
                # domain error.
                raise ExectionError(
                    'Failed to execute metadata processor "{}"'.format(
                        self.metadata_proc_name)) from e

        return df