Ejemplo n.º 1
0
class Analyzer:

        # -------------------------------------------------------------
        # 
        #  __init__ (in_file_directory, in_file_name, 
#                         in_extension, in_delimiter, in_missing_value,
#                         in_quote, 
#                         out_file_directory, out_file_name, 
#                         out_extension, out_delimiter, 
#                         out_missing_value,
#                         raw_data_structure, clean_data_structure) 
        # 
        # -------------------------------------------------------------

        def __init__(self, in_file_directory, in_file_name, 
                         in_extension, in_delimiter, in_missing_value,
                         in_quote, 
                         out_file_directory, out_file_name, 
                         out_extension, out_delimiter, 
                         out_missing_value,
                         out_file_single_file, out_file_separate_line,
                         raw_data_structure, clean_data_structure
                         ):
                
                # initializing file handlers for in and out files
                self.in_file_handler                    = FileHandler (in_file_directory, 
                                                                       in_file_name, in_extension, in_delimiter)
                if out_file_single_file == 'on':
                    if out_file_separate_line == 'title':
                        out_file_separate_line = in_file_name
                                                                           
                self.out_file_handler                   = FileHandler (out_file_directory, 
                                                                       out_file_name, out_extension, out_delimiter,
                                                                       out_file_single_file, out_file_separate_line)
                # initializing data handler for raw and clean data
                self.raw_data_handler                   = DataHandler()
                self.raw_data_handler.structure         = raw_data_structure
                self.clean_data_handler                 = DataHandler()
                self.clean_data_handler.structure       = clean_data_structure
                # initialize the error message string
                self.error_message                      = ''
                self.unquote                            = False
                if in_quote == 'yes':
                        self.unquote                    = True
                self.in_missing_value                   = in_missing_value
                self.out_missing_value                  = out_missing_value

        # -------------------------------------------------------------
        # 
        #  check_valid ()
        # 
        # -------------------------------------------------------------

        def check_valid(self):

                valid = True

                if not self.in_file_handler.check_valid():
                        self.error_message      = self.error_message + self.in_file_handler.error_message
                        valid                   = False

                if not self.out_file_handler.check_valid():
                        self.error_message      = self.error_message + self.out_file_handler.error_message
                        valid                   = False

                if not self.raw_data_handler.check_valid():
                        self.error_message      = self.error_message + self.raw_data_handler.error_message
                        valid                   = False

                if not self.clean_data_handler.check_valid():
                        self.error_message      = self.error_message + self.clean_data_handler.error_message
                        valid                   = False

                return valid

        # -------------------------------------------------------------
        # 
        #  structure_raw_data ()
        # 
        # -------------------------------------------------------------

        def structure_raw_data(self):

                # 1- import data from file
                self.in_file_handler.import_data(False, self.in_missing_value)  # If True show first 1000, if FALSE all dataset
                self.in_file_handler.data       = filter(None, self.in_file_handler.data)
                if self.unquote == True:
                       self.in_file_handler.unquote_data()
                # 2- transfer data to data handler
                self.raw_data_handler.data      = list(self.in_file_handler.data)
                # 3- structure the data
                self.raw_data_handler.structure_data()
                
        # -------------------------------------------------------------
        # 
        #  generate_clean_data ()
        # 
        # -------------------------------------------------------------

        def generate_clean_data(self):

                # 1- transfer data to clean data handler
                self.clean_data_handler.data        = list(self.raw_data_handler.data_final)
                # 2- clean the data given the raw data structure
                self.clean_data_handler.clean_data(self.raw_data_handler.structure)

        # -------------------------------------------------------------
        # 
        #  save_clean_data ()
        # 
        # -------------------------------------------------------------

        def save_clean_data(self):

                # 1- transfer clean data to file handler
                self.out_file_handler.data      = list(self.clean_data_handler.data_final)
                # 2- save the data
                id_list = []
                for line in self.clean_data_handler.data_final :
                    #line = line.split(',')
                    id_list.append(line[0])
                    id_list.append(line[1])

                self.out_file_handler.write_data_file(self.out_missing_value)

        # -------------------------------------------------------------
        # 
        #  map_data (file_directory, file_name,
        #              file_extension, file_delimiter,
        #              kept_id_position, lost_id_position, target_positions, 
        #              drop_unreferenced_entries, target_unreferenced_entries,
        #              drop_ghosts, 
        #              remove_duplicates, target_duplicates_set,
        #              merge_entries, target_merge_set, commands )
        # 
        # -------------------------------------------------------------

        def map_data ( self, file_directory, file_name,
                      file_extension, file_delimiter,
                      replace_ids,
                      kept_id_position, lost_id_position, target_positions, 
                      drop_unreferenced_entries, target_unreferenced_entries,
                      drop_ghosts, 
                      remove_duplicates, target_duplicates_set,
                      merge_entries, target_merge_set, commands ):

                # 1 - initialize file handler
                mapper_file_handler         = FileHandler(file_directory, file_name,
                                                         file_extension, file_delimiter)
                # TODO
                # if not self.mergerFileHandler.check_valid():
                        # stop_system(_mergFileHandler.error_message)
                        
                # 2 - import merge info from file
                mapper_file_handler.import_data(False)
                
                # 3 - initialize data handler
                mapper_data_handler         = DataHandler()
                
                # 4 - transfer data
                mapper_data_handler.data    = list(mapper_file_handler.data)
                
                # 5 - prepare positions from names
                # target positions
                target_position_index = []        
                if target_positions != 'off':
                    target_position_index = self.get_indeces_from_clean_structure (target_positions)
               
                # unreferenced entities
                target_unreferenced_entries_index = []
                if drop_unreferenced_entries != 'off':
                    target_unreferenced_entries_index = self.get_indeces_from_clean_structure (target_unreferenced_entries)
                    
                # duplicate defining set positions
                target_duplicates_set_index   = []
                if remove_duplicates != 'off':
                    target_duplicates_set_index = self.get_indeces_from_clean_structure (target_duplicates_set)
               
                # merge defining set positions    
                target_merge_set_index = []
                if merge_entries != 'off':
                    target_merge_set_index = self.get_indeces_from_clean_structure (target_merge_set)
                    
                # 6 - get the translated commands
                commands = self.get_commands(commands, target_merge_set_index)
                    
                # 7 - initialize a mapper with all info
                mapper      = Mapper(list(self.clean_data_handler.data_final), 
                                                                mapper_data_handler.data,
                                                                replace_ids,
                                                                kept_id_position, lost_id_position,
                                                                target_position_index, 
                                                                drop_unreferenced_entries, target_unreferenced_entries_index,
                                                                drop_ghosts, 
                                                                remove_duplicates, target_duplicates_set_index,
                                                                merge_entries, target_merge_set_index, 
                                                                commands)   
                
                return [list(mapper.mapped_data), mapper.str_counter_ids]
                
        # -------------------------------------------------------------
        # 
        #  get_index_from_clean_structure (position) 
        # 
        # -------------------------------------------------------------
        
        def get_indeces_from_clean_structure (self, positions): 
            
            admitted_all_expression = ['all', 'All', ' ', '']
            indeces = []            
            if positions in admitted_all_expression:
                # take all positions
                indeces = range(len(self.clean_data_handler.structure)/2)
            else:
                # remove spaces before or after the ','
                positions = positions.replace(', ',',')
                positions = positions.replace(' ,',',')
                list_positions = positions.split(',')
                
                for position in list_positions:
                    if position in self.clean_data_handler.structure:
                        index = [i for i in range(len(self.clean_data_handler.structure)) if self.clean_data_handler.structure[i] == position][0]/2
                        indeces.append(index)
                    else:
                        sys.exit("ERROR: variable {0} not found in the clean data structure".format(position))
            return indeces
            
        # -------------------------------------------------------------
        # 
        #  get_commands (commands, other_fields)
        # 
        # -------------------------------------------------------------
        
        def get_commands (self, commands, other_fields): 
            
            final_commands = dict()
            
            admitted_command_expression = ['+', 'same', 'avg']
            
            
            list_commands = commands.replace(', ',',')
            list_commands = list_commands.replace(' ,',',')
            list_commands = list_commands.split(',')
            
            for command in list_commands:
                command = command.replace(' :',':')
                command = command.replace(': ',':')
                command = command.split(':')
                if command[0] in self.clean_data_handler.structure:
                        index = [i for i in range(len(self.clean_data_handler.structure)) if self.clean_data_handler.structure[i] == command[0]][0]/2
                        if command[1] in admitted_command_expression:
                            final_commands[str(index)] = command[1]
                        else:
                            sys.exit("ERROR: Command \'{0}\' not admitted".format(command[1]))
                else:
                    sys.exit("ERROR: In the Command list:\n variable \'{0}\' not found in the clean data structure".format(command[0]))
            
            # check if fields in the command are not in other_fields (fields used for merging recognition)
            for i in range(len(self.clean_data_handler.structure)/2):
                if str(i) not in final_commands:
                    if i not in other_fields:
                        final_commands[str(i)]='same'
                else:
                    if i in other_fields:
                        sys.exit("ERROR: In the Command list:\n variable \'{0}\' given for the merging set and the command set".format(command[0]))
            
            return final_commands

                
        # -------------------------------------------------------------
        # 
        #  get_clean_data ()
        # 
        # -------------------------------------------------------------

        def get_clean_data(self):

                consist_data = self.clean_data_handler.data_final
                return consist_data

        # -------------------------------------------------------------
        # 
        #  get_merged_data ()
        # 
        # -------------------------------------------------------------

        def get_merged_data(self):
                return self.Merger.data_merged