コード例 #1
0
ファイル: cleaner_base.py プロジェクト: qrodoo-dev/backend
class CleanerBase:
    '''
    Base class for cleaners that process the raw records of one source.

    On construction it creates a logger and the raw/clean data adapters for
    ``source_name`` and reads the schema file. ``run`` feeds every uncleaned
    raw record to ``clean`` and records the outcome through the raw adapter.
    ``clean`` is a no-op here -- presumably meant to be overridden by
    subclasses.
    '''
    def __init__(self, data_adapter_config_path, source_name, schema_file):
        '''
        Constructor.

        :param data_adapter_config_path: configuration path handed unchanged
            to both DataRawAdapter and DataCleanAdapter.
        :param source_name: source identifier; given to the logger and both
            adapters and kept as ``self.source_name``.
        :param schema_file: path of the schema file read by ``get_schema``.
        '''
        self.logger = Logger("cleaner", source_name)
        self.data_raw_adapter = DataRawAdapter(data_adapter_config_path,
                                               source_name, self.logger)
        self.data_clean_adapter = DataCleanAdapter(data_adapter_config_path,
                                                   source_name, self.logger)
        self.source_name = source_name
        self.get_schema(schema_file)

    def get_schema(self, schema_file):
        '''
        Parse the schema file and (re)initialise the field lists.

        :param schema_file: path of the schema document to open and parse.

        Sets ``self.required_fields`` and ``self.optional_fields`` to empty
        lists.
        NOTE(review): the SchemaField built for each ``field`` element is
        never stored in either list, so both stay empty -- the population
        logic looks unfinished; confirm the intended rules before relying
        on these attributes.
        '''
        # The context manager closes the file even if read() or parsing
        # raises, which the previous manual open()/close() pair did not.
        with open(schema_file) as f:
            schema = BeautifulSoup(f.read())
        self.required_fields = []
        self.optional_fields = []
        for field in schema.findAll("field"):
            schema_field = SchemaField()
            # Stores the parsed element itself, not a plain string.
            schema_field.name = field

    def clean(self, url_hash, url, features, images):
        '''
        Clean one raw record; this base implementation does nothing.

        :returns: None here. ``run`` treats any falsy result as a failure.
        '''
        pass

    def run(self):
        '''
        Clean every uncleaned raw record and persist the resulting status.

        Each item yielded by ``get_uncleaned_data_raw`` is unpacked as
        ``(url_hash, url, features, images)`` and passed to ``clean``. A
        falsy result marks the record CLEAN_ERROR, a truthy one CLEANED.
        With the base ``clean`` (which returns None) every record is
        therefore marked CLEAN_ERROR.
        '''
        for url_hash, url, features, images in self.data_raw_adapter.get_uncleaned_data_raw(
        ):
            success = self.clean(url_hash, url, features, images)
            if not success:
                self.data_raw_adapter.update_data_raw_status(
                    url_hash, DataRawStatus.CLEAN_ERROR)
            else:
                self.data_raw_adapter.update_data_raw_status(
                    url_hash, DataRawStatus.CLEANED)
コード例 #2
0
ファイル: cleaner_base.py プロジェクト: qrodoo-dev/backend
class CleanerBase:
    '''
    Base class for cleaners that process the raw records of one source.

    The constructor builds a logger and the raw/clean data adapters for
    ``source_name`` and loads the schema file. ``run`` passes each uncleaned
    raw record to ``clean`` and writes the outcome back through the raw
    adapter. ``clean`` does nothing in this class -- presumably subclasses
    are meant to override it.
    '''
    def __init__(self, data_adapter_config_path, source_name, schema_file):
        '''
        Constructor.

        :param data_adapter_config_path: configuration path forwarded as-is
            to DataRawAdapter and DataCleanAdapter.
        :param source_name: source identifier; passed to the logger and the
            adapters and stored on ``self.source_name``.
        :param schema_file: path of the schema file consumed by
            ``get_schema``.
        '''
        self.logger = Logger("cleaner", source_name)
        self.data_raw_adapter = DataRawAdapter(data_adapter_config_path, source_name, self.logger)
        self.data_clean_adapter = DataCleanAdapter(data_adapter_config_path, source_name, self.logger)
        self.source_name = source_name
        self.get_schema(schema_file)

    def get_schema(self, schema_file):
        '''
        Parse the schema file and (re)initialise the field lists.

        :param schema_file: path of the schema document to open and parse.

        Leaves ``self.required_fields`` and ``self.optional_fields`` as
        empty lists.
        NOTE(review): each ``field`` element gets a SchemaField, but that
        object is never appended anywhere, so the lists are never filled --
        this looks unfinished; confirm the intended classification rules.
        '''
        # Use a context manager so the handle is released even when read()
        # or parsing raises; the old open()/close() pair leaked it then.
        with open(schema_file) as f:
            schema = BeautifulSoup(f.read())
        self.required_fields = []
        self.optional_fields = []
        for field in schema.findAll("field"):
            schema_field = SchemaField()
            # The parsed element itself is assigned, not a plain string.
            schema_field.name = field

    def clean(self, url_hash, url, features, images):
        '''
        Clean one raw record; the base implementation is a no-op.

        :returns: None here. ``run`` counts any falsy result as a failure.
        '''
        pass

    def run(self):
        '''
        Clean every uncleaned raw record and store the resulting status.

        Items from ``get_uncleaned_data_raw`` are unpacked as
        ``(url_hash, url, features, images)`` and handed to ``clean``. A
        falsy result sets the record to CLEAN_ERROR, a truthy one to
        CLEANED. Because the base ``clean`` returns None, every record ends
        up as CLEAN_ERROR unless ``clean`` is overridden.
        '''
        for url_hash, url, features, images in self.data_raw_adapter.get_uncleaned_data_raw():
            success = self.clean(url_hash, url, features, images)
            if not success:
                self.data_raw_adapter.update_data_raw_status(url_hash, DataRawStatus.CLEAN_ERROR)
            else:
                self.data_raw_adapter.update_data_raw_status(url_hash, DataRawStatus.CLEANED)