def _get_groups(self, source_number):
        """Fetch the schema-matching records for the given source and run
        the configured grouping module over them, returning its result."""
        step_name = SchemaMatchingStep().class_name
        records = DALMongo(self.project_id).get_records(step_name,
                                                        source_number)
        return self._load_module(records=records).run()
    def _clean_source(self, source_number):
        """Apply every configured data-cleansing module to each column of
        each extracted record, then store the cleansed records.

        :param source_number: identifier of the source to clean (1 or 2)
        """
        # Fetch the records produced by the extraction step
        dal = DALMongo(self.project_id)
        records = dal.get_records(ExtractionStep().class_name, source_number)

        # Hoisted: the per-column module configuration is the same for
        # every record of this source.
        column_config = self.config["source{}".format(source_number)]

        # Do cleansing for each column of each record
        for record in records:
            for col, datacleansing_modules in column_config.items():
                for datacleansing_module in datacleansing_modules:
                    module = self._load_module(datacleansing_module)
                    # TODO the module should be given only the field value (string) and not the column
                    record.columns[col] = module.run(record.columns[col])

        self._append_result_collection(
            records, "source{}_records".format(source_number))
    def _standardise_and_tag_source(self, source_number):
        """Run the configured standardisation-and-tagging module on every
        column of every cleansed record, then store the results."""
        # Get cleansed records from MongoDB
        dal = DALMongo(self.project_id)
        records = dal.get_records(DataCleansingStep().class_name,
                                  source_number)

        source_key = "source{}".format(source_number)
        for record in records:
            for col, st_module_config in self.config[source_key].items():
                st_module = self._load_module(st_module_config)
                record.columns[col] = st_module.run(record.columns[col])

        self._append_result_collection(
            records, "source{}_records".format(source_number))
    def _segment_source(self, source_number):
        """Run the configured segmentation module on every column of every
        record of the given source, extending the schema with any new output
        fields discovered, then store both the records and the new schema.

        :param source_number: identifier of the source to segment (1 or 2)
        """
        dal = DALMongo(self.project_id)
        records = dal.get_records(StandardisationAndTaggingStep().class_name,
                                  source_number)

        # Start from the original schema, keyed by column name; new output
        # fields found during segmentation are appended to each column's
        # field list below.
        new_cols = {c_obj.name: c_obj
                    for c_obj in dal.get_schema(source_number)}

        # Run segmentation module for each column of each record
        for record in records:
            for col_name, segmentation_module in self.config["source{}".format(
                    source_number)].items():
                module = self._load_module(segmentation_module)
                record.columns[col_name] = module.run(record.columns[col_name])

                # Add to the schema any output field not already known for
                # this column (the known list is recomputed so fields added
                # by earlier records are taken into account).
                for field_obj in record.columns[col_name].fields:
                    known_outputs = [field.output_field
                                     for field in new_cols[col_name].fields]
                    if field_obj.output_field is not None and \
                            field_obj.output_field not in known_outputs:
                        # TODO tags could be appended as well but for now we leave it empty
                        new_cols[col_name].fields.append(
                            Field(value="n/A",
                                  tipe=field_obj.tipe,
                                  output_field=field_obj.output_field,
                                  tags=[]))

        # Flatten the schema dict into the list form the DAL expects
        segmented_schema = list(new_cols.values())

        self._append_result_collection(
            records, 'source{}_records'.format(source_number))
        self._append_result_collection(segmented_schema,
                                       'source{}_schema'.format(source_number))
    def run_implementation(self):
        """Default implementation of each step's particular run signature.

        Reads both sources' records from the previous step (falling back to
        the standardisation-and-tagging output when segmentation was
        skipped), runs the configured module on them, and stores the new
        schema together with both transformed record sets.
        """
        dal = DALMongo(self.project_id)

        if self.segmentation_skipped:
            dal.drop_segmentation()
            previous_step = "StandardisationAndTaggingStep"
        else:
            previous_step = "SegmentationStep"

        records1 = dal.get_records(previous_step, 1)
        records2 = dal.get_records(previous_step, 2)

        module = self._load_module(project_id=self.project_id,
                                   records1=records1,
                                   records2=records2)
        new_schema, records1, records2 = module.run()

        self._append_result_collection(records1, 'source1_records')
        self._append_result_collection(records2, 'source2_records')
        self._append_result_collection(new_schema, 'new_schema')