class Extractor:
    """
    Build a feature table (``self.features``) from entity rows stored in a
    database table.

    :meth:`extract_features` is the entry point: it fetches the rows of the
    requested entities, pivots their ``data`` mappings into one DataFrame
    column per key and label-encodes every column.
    """
    def __init__(self, db_table):
        """
        Create an empty extractor bound to one database table.

        :param db_table: table name handed to ``DBManager.db_fetch``
        """
        self.db_manager = DBManager()
        self.db_table = db_table
        # Number of rows of the feature table.
        self.size = 0
        # One column per feature key, one row per extracted entity.
        self.features = pd.DataFrame()
        # Fetched rows, keyed by their 'name' field.
        self.extracted_data = dict()
        # Feature key -> number of fetched rows that carry it.
        self.common_keys = dict()

    def extract_features(self, lookup):
        """
        Extract new features from the DB for the given entities.

        Fetches the rows (``build``), maps every feature to its values
        (``extract``) and handles missing data / label-encodes the columns
        (``data_handling``, sparse-feature cleaning disabled).
        The DB connection is closed even when one of the steps raises.

        :param lookup: sized iterable of entity identifiers
        :return: None; the result is left in ``self.features``
        """
        try:
            self.build(lookup)
            self.extract()
            self.data_handling(clean_sparse=False)
        finally:
            # Always release the connection, also on failure.
            self.db_manager.db_close()

    def build(self, entities):
        """
        Fetch the rows of every non-empty entity and record the row count.

        Each entity is passed through ``Utils.normalize_uri`` and
        ``Utils.normalize_name`` before the lookup. An ``Error`` raised
        while fetching is reported on stdout and swallowed.

        :param entities: sized iterable of entity identifiers
        :return: None
        """
        try:
            self.size = len(entities)
            self.prepare_database()
            for entity in entities:
                if entity:
                    self.fetch(
                        Utils.normalize_name(Utils.normalize_uri(entity)))
        except Error as e:
            print("Error while build features", e)

    def fetch(self, entity):
        """
        Fetch from the database the rows of one entity (if any exist).

        Every row with a non-empty 'name' is stored in
        ``self.extracted_data``; the keys of its 'data' mapping are counted
        in ``self.common_keys``.

        :param entity: normalized entity identifier
        :return: None
        """
        # db_fetch returns a list of items (or nothing when there is no match)
        result = self.db_manager.db_fetch(self.db_table, entity)
        if not result:
            return
        for item in result:
            if 'name' in item.keys() and item['name']:
                self.extracted_data[item['name']] = item
                data = self._entity_data(item)
                if data:
                    self.prepare_common_keys(data)

    @staticmethod
    def _entity_data(item):
        """Return the row's 'data' mapping, or an empty dict when it is
        missing or empty."""
        if 'data' in item.keys() and item['data']:
            return item['data']
        return {}

    def prepare_common_keys(self, item):
        """
        Count the occurrence of every key of ``item`` in ``self.common_keys``.

        :param item: mapping whose keys are feature names
        :return: None
        """
        for key in item.keys():
            self.common_keys[key] = self.common_keys.get(key, 0) + 1

    def extract(self):
        """
        Turn the fetched rows into one ``self.features`` column per feature.

        Row ``i`` of every column belongs to the ``i``-th entry of
        ``self.extracted_data``; cells of entities that lack the feature are
        ``None``. When more rows were fetched than entities were requested,
        ``self.size`` is raised to the number of fetched rows so that every
        row has a slot (and later steps that size buffers by ``self.size``
        stay consistent).

        :return: None
        """
        row_count = max(self.size, len(self.extracted_data))
        self.size = row_count
        # Rows stored without usable 'data' contribute only empty cells.
        entities = [
            self._entity_data(item) for item in self.extracted_data.values()
        ]

        for feature in self.common_keys:
            values = [None] * row_count
            for i, data in enumerate(entities):
                if feature in data:
                    values[i] = data[feature]

            self.features[feature] = values

    def data_handling(self, clean_sparse=True):
        """
        Clean sparse features (if requested), handle missing data and
        label-encode every column of ``self.features`` in place.

        Columns whose type (per ``Utils.find_feature_type``) is ``list`` are
        replaced by the list lengths (0 for ``None``). For every other
        column a list cell is reduced to its first element, ``None`` (or an
        empty list) becomes ``""`` and double quotes are stripped from the
        value.
        NOTE(review): non-list cells are assumed to be strings - confirm.

        :param clean_sparse: drop sparse columns first when True
        :return: None
        """
        if clean_sparse:
            self.clean_sparse_features()

        le = preprocessing.LabelEncoder()

        # Snapshot the column labels: the columns are reassigned in the loop.
        for feature in list(self.features.columns):
            feature_data = self.features[feature]

            feature_type = Utils.find_feature_type(feature_data)

            values = Utils.init_values(feature_type, self.size)

            for i, value in enumerate(feature_data):
                if feature_type == list:
                    values[i] = 0 if value is None else len(value)
                else:
                    if isinstance(value, list):
                        # An empty list carries no value: treat it as missing.
                        value = value[0] if value else None
                    values[i] = "" if value is None else value.replace(
                        "\"", "")

            # The cleaned values may have a different type than the raw cells.
            feature_type = Utils.find_feature_type(values)
            values = Utils.missing_data(feature_type, values)
            values = le.fit_transform(values)

            self.features[feature] = values

    def clean_sparse_features(self):
        """
        Drop every column whose ratio of distinct non-null values to rows
        (rounded to 2 decimals) is below THRESHOLD.

        NOTE(review): ``value_counts`` needs hashable cells, so this has to
        run on columns that do not hold lists - confirm against callers.

        :return: None
        """
        row_count = len(self.features)
        if row_count == 0:
            # No rows: the ratio is undefined, nothing to clean.
            return

        sparse = [
            feature for feature in self.features.columns
            if round(len(self.features[feature].value_counts()) / row_count,
                     2) < THRESHOLD
        ]
        # Keyword form: positional ``axis`` was removed in pandas 2.0.
        self.features.drop(columns=sparse, inplace=True)

    def prepare_database(self):
        """
        Check if the DB is connected, else establish a connection.

        :return: True when connected (possibly after connecting), the
            connection state after the attempt otherwise; False when there
            is no DB manager
        """
        manager = self.db_manager
        if manager is None:
            return False
        if manager.is_connected():
            return True
        manager.db_connect()
        return manager.is_connected()


# Example no. 2 (0)
class Parser:
    """
    Parse Freebase triples into DB tables according to entity type.
    """
    def __init__(self):
        """
        Create a parser with its own DB manager.
        """
        self.db_manager = DBManager()

    def init_database(self):
        """
        Connect to the DB and initialise it, unless already connected.

        :return: connection state after connecting and ``db_init``; False
            when there is no DB manager or it was already connected
        """
        manager = self.db_manager
        if manager is None or manager.is_connected():
            return False
        manager.db_connect()
        manager.db_init()
        return manager.is_connected()

    def read_data(self, file):
        """
        Read a gzip-compressed triple file and store one topic per subject.

        Consecutive lines with the same subject are grouped into a
        ``predicate -> [objects]`` mapping. Whenever the subject changes,
        and once more at the end of the file, the finished group is handed
        to ``prepare_to_save`` together with the subject it belongs to.
        Groups with an empty subject are not saved. Progress is printed
        every 1,000,000 lines; an ``Error`` is reported on stdout and
        swallowed.

        :param file: path of the gzip file, read in text mode
        :return: None
        """
        try:
            total = 0
            current_mid = ""
            current_topic = dict()
            with gzip.open(file, 'rt') as f:
                for line in f:
                    subject, predicate, obj = Utils.parse_triple(line)
                    if subject != current_mid:
                        # The previous subject is complete: store it under
                        # its own mid, then start a fresh group.
                        if current_mid and current_topic:
                            self.prepare_to_save(current_mid, current_topic)
                        current_topic = dict()
                        current_mid = subject

                    # Every triple, including the first one of a subject,
                    # belongs to the current group.
                    current_topic.setdefault(predicate, []).append(obj)

                    total += 1
                    if total % 1000000 == 0:
                        print("iTotal: ", total)
                        print()

            # The last subject has no following line that would flush it.
            if current_mid and current_topic:
                self.prepare_to_save(current_mid, current_topic)

        except Error as e:
            print("Error while reading file", e)

    def prepare_to_save(self, subject, current_topic):
        """
        Save the topic into the table of every allowed entity type it has.

        For each value of '/type/object/type' the first ALLOWED_ENTITIES
        pattern that matches (``re.search``) selects the table. A topic is
        written to a given table at most once, even when several of its
        types map to that table. Nothing is saved while the DB is not
        connected.
        NOTE(review): table names are assumed to be hashable (strings).

        :param subject: mid of the topic
        :param current_topic: ``predicate -> [objects]`` mapping
        :return: None
        """
        if '/type/object/type' not in current_topic:
            return

        saved_tables = set()
        for entity_type in current_topic['/type/object/type']:
            for pattern, table in ALLOWED_ENTITIES.items():
                if not re.search(pattern, entity_type):
                    continue
                if table in saved_tables:
                    # First matching pattern wins; already stored there.
                    break
                # Save to DB
                if self.prepare_database():
                    self.save_to_database(table, subject, current_topic)
                    saved_tables.add(table)
                    break

    def prepare_database(self):
        """
        Check if the DB is connected, else establish a connection.

        :return: True when connected (possibly after connecting), the
            connection state after the attempt otherwise; False when there
            is no DB manager
        """
        manager = self.db_manager
        if manager is None:
            return False
        if manager.is_connected():
            return True
        manager.db_connect()
        return manager.is_connected()

    def save_to_database(self, table, subject, current_topic):
        """
        Insert one topic into ``table``.

        The topic is passed through ``Utils.handle_duplicate`` and
        ``Utils.handle_language``. Its name is taken from
        '/type/object/name' or, when that key is absent, from 'label'
        (first element when the value is a list); the name key is left out
        of the stored JSON. Nothing is inserted when the name is empty or
        no other data remains. The dict the helpers return is copied before
        the name key is removed, so the removal itself never reaches the
        caller's dict.

        :param table: target table name
        :param subject: mid of the topic
        :param current_topic: ``predicate -> objects`` mapping
        :return: None
        """
        current_topic = Utils.handle_duplicate(current_topic)
        current_topic = Utils.handle_language(current_topic)

        if not current_topic:
            return

        # Work on a copy: removing the name must not affect later saves of
        # the same topic into another table.
        payload = dict(current_topic)

        name = ''
        for name_key in ('/type/object/name', 'label'):
            if name_key in payload:
                raw_name = payload.pop(name_key)
                if isinstance(raw_name, list):
                    name = raw_name[0] if raw_name else ''
                else:
                    name = raw_name
                break

        if name and payload:
            name = Utils.clean_lang(name)
            self.db_manager.db_insert(table, name, subject,
                                      json.dumps(payload))