Example #1
 def load_and_preprocess(self, **kwargs):
     input_dir = kwargs.get("input_dir")
     file_type = kwargs.get("file_type", "other")
     job = kwargs.get("job", None)
     start = time.time()
     if job is not None:
         job.meta['step'] = "materializing the dataset..."
         job.save_meta()
     requests_result = requests.get(input_dir)
     if requests_result.status_code // 100 != 2:
         raise ValueError("Reading file from {} failed.".format(
             str(input_dir)))
     metadata = dict()
     metadata['url'] = input_dir
     title_cleaned = input_dir.split("/")[-1]
     words_processed = remove_punctuation(title_cleaned)
     metadata['title'] = " ".join(words_processed)
     metadata['file_type'] = file_type
     all_metadata = [metadata]
     result = [requests_result.content]
     return PreParsedResult(result, all_metadata)
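
A minimal usage sketch for this variant, assuming an enclosing uploader class; the class name `SimpleUploader`, the example URL, and the result handling are illustrative, not taken from the source:

# hypothetical: SimpleUploader is assumed to be the class defining load_and_preprocess above
uploader = SimpleUploader()
pre_parsed = uploader.load_and_preprocess(
    input_dir="https://example.org/files/report.pdf",  # any HTTP-reachable file URL
    file_type="pdf",                                    # defaults to "other" when omitted
    job=None,                                           # pass an RQ job to surface progress in job.meta
)
# PreParsedResult pairs the raw file content with one metadata dict per item
print(pre_parsed.metadata[0]["title"])
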
Example #2
 def model_details(self, parsed_data: str, item):
     """
     Model the details of the given parsed image files; currently we just collect all words and save them.
     :param parsed_data: parsed text from the image file
     :param item: etk knowledge graph doc
     :return: the updated item
     """
     statement = item.add_statement('C2005', StringValue("all_data"))
     all_value_str_set = set()
     words_processed = remove_punctuation(parsed_data)
     for word in words_processed:
         all_value_str_set.add(word)
     all_value_str = " ".join(all_value_str_set)
     self._logger.debug("Totally {} words added ".format(
         str(len(all_value_str_set))))
     statement.add_qualifier('C2006', StringValue(all_value_str))  # values
     statement.add_qualifier('C2007', Item("string"))  # data structure type
     statement.add_qualifier(
         'C2008',
         URLValue('http://schema.org/Text'))  # semantic type identifier
     return item
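
A hedged call sketch for model_details; the `modeler` object, the parsed text, and the starting `item` are illustrative assumptions, and it relies on remove_punctuation returning an iterable of cleaned words as used above:

# hypothetical: `modeler` exposes model_details, `item` is an etk knowledge-graph item
parsed_text = "Quarterly revenue by region: North, South, East, West."
item = modeler.model_details(parsed_data=parsed_text, item=item)
# the item now carries a C2005 statement whose C2006 qualifier holds the de-duplicated words
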
    def process_one_column(self, column_data: pd.Series, item: WDItem, column_number: int,
                           semantic_type: typing.List[str]) -> typing.Union[WDItem, None]:
        """
        :param column_data: a pandas series data
        :param item: the target q node aimed to add on
        :param column_number: the column number
        :param semantic_type: a list indicate the semantic type of this column
        :return: a bool indicate succeeded or not
        """
        start = time.time()
        self._logger.debug("Start processing No." + str(column_number) + " column.")
        statement = item.add_statement('C2005', StringValue(column_data.name))  # variable measured
        try:
            # updated v2020.1.9: it seems the dsbox profiler does not convert year-only data, so we check here
            if 'http://schema.org/Integer' in semantic_type and "year" in column_data.name:
                try:
                    column_data = column_data.astype("int")
                    if max(column_data) < 2100 and min(column_data) > 1000:
                        column_data = pd.to_datetime(column_data, format='%Y', errors="raise")
                        self._logger.info("Detected year data in column No.{}!".format(column_number))
                except Exception:
                    pass

            if 'http://schema.org/DateTime' in semantic_type or "datetime" in column_data.dtype.name:
                data_type = "datetime"
                semantic_type_url = "http://schema.org/DateTime"
                start_date = min(column_data)
                end_date = max(column_data)

                # updated v2019.12.12: check details, only treat as the granularity
                # if we found more than 1 values for this granularity
                time_granularity = datamart_utils.map_granularity_to_value(datamart_utils.get_time_granularity(column_data))
                start_time_str = datetime.fromtimestamp(start_date.timestamp(), tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%S")
                end_time_str = datetime.fromtimestamp(end_date.timestamp(), tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%S")

                start_time = TimeValue(Literal(start_time_str, type_=LiteralType.dateTime), Item('Q1985727'),
                                       time_granularity, 0)
                end_time = TimeValue(Literal(end_time_str, type_=LiteralType.dateTime), Item('Q1985727'),
                                     time_granularity, 0)

                statement.add_qualifier('C2011', start_time)
                statement.add_qualifier('C2012', end_time)
                statement.add_qualifier('C2013', QuantityValue(time_granularity))
            else:
                all_data = set(column_data.tolist())
                all_value_str_set = set()
                for each in all_data:
                    # set to lower characters, remove punctuation and split by the space
                    words_processed = remove_punctuation(each)
                    for word in words_processed:
                        all_value_str_set.add(word)
                all_value_str = " ".join(all_value_str_set)

                statement.add_qualifier('C2006', StringValue(all_value_str))  # values
                if 'http://schema.org/Float' in semantic_type:
                    semantic_type_url = 'http://schema.org/Float'
                    data_type = "float"
                elif 'http://schema.org/Integer' in semantic_type:
                    data_type = "int"
                    semantic_type_url = 'http://schema.org/Integer'
                else:  # 'http://schema.org/Text' in semantic_type:
                    data_type = "string"
                    semantic_type_url = 'http://schema.org/Text'

            statement.add_qualifier('C2007', Item(data_type))  # data structure type
            statement.add_qualifier('C2008', URLValue(semantic_type_url))  # semantic type identifier
            statement.add_qualifier('P1545', QuantityValue(column_number))  # column index
            end1 = time.time()
            self._logger.info("Processing finished, totally take " + str(end1 - start) + " seconds.")
            return item

        except Exception as e:
            self._logger.error("[ERROR] processing column No." + str(column_number) + " failed!")
            self._logger.debug(e, exc_info=True)
            return None
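
A sketch of driving process_one_column for a single column; the DataFrame, the dataset node id, the `modeler` object, and the semantic-type list are illustrative assumptions:

import pandas as pd

df = pd.DataFrame({"year": ["1995", "2000", "2005"], "value": ["1.2", "3.4", "5.6"]})
q = WDItem("Dexample")                          # hypothetical dataset node
semantic_type = ["http://schema.org/Integer"]   # as the profiler might report for the "year" column
updated = modeler.process_one_column(column_data=df.iloc[:, 0], item=q,
                                     column_number=0, semantic_type=semantic_type)
if updated is None:
    print("modeling this column failed or timed out")
else:
    q = updated
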
    def load_and_preprocess(self, **kwargs):
        input_dir = kwargs.get("input_dir")
        file_type = kwargs.get("file_type", "csv")
        job = kwargs.get("job", None)
        wikifier_choice = kwargs.get("wikifier_choice", "auto")
        start = time.time()
        self._logger.debug("Start loading from " + input_dir)
        if job is not None:
            job.meta['step'] = "materializing the dataset..."
            job.save_meta()
        from_online_file = False
        if file_type == "csv":
            try:
                _ = [pd.read_csv(input_dir, dtype=str)]
                file_type = "online_csv"
            except Exception:
                raise ValueError("Reading csv from" + input_dir + "failed.")

        if len(file_type) > 7 and file_type[:7] == "online_":
            from_online_file = True
            general_materializer = GeneralMaterializer()
            file_type = file_type[7:]
            # example: "csv"
            file_metadata = {
                "materialization": {
                    "arguments": {
                        "url": input_dir,
                        "file_type": file_type
                    }
                }
            }
            try:
                result = general_materializer.get(metadata=file_metadata).to_csv(index=False)
            except Exception as e:
                self._logger.debug(e, exc_info=True)
                raise ValueError("Loading online data from " + input_dir + " failed!")
            # remove the trailing "\n" so that we do not get an extra useless row
            if result[-1] == "\n":
                result = result[:-1]
            loaded_data = StringIO(result)
            loaded_data = [pd.read_csv(loaded_data, dtype=str)]

        elif file_type == "wikitable":
            from_online_file = True
            materializer = WikitablesMaterializer()
            loaded_data, xpaths = materializer.get(input_dir)
        else:
            raise ValueError("Unsupported file type")
        end1 = time.time()
        self._logger.info("Loading finished. Totally take " + str(end1 - start) + " seconds.")
        if job is not None:
            job.meta['step'] = "materialization finished, start running wikifier..."
            job.meta['loading dataset used'] = str(timedelta(seconds=end1 - start))
            job.save_meta()

        all_wikifier_res = []
        all_metadata = []
        for df_count, each_df in enumerate(loaded_data):
            if each_df.shape[0] == 0:
                raise ValueError("Detect empty when loading No.{} table, please check!".format(str(df_count)))
            if wikifier_choice == "false":
                do_wikifier = False
            elif wikifier_choice == "true":
                do_wikifier = True
            else:
                do_wikifier = None

            # this function will also determine whether to do wikifier or not if do_wikifier = None
            do_wikifier = save_wikifier_choice(input_dataframe=each_df, choice=do_wikifier)

            if do_wikifier:
                self._logger.info("Will run wikifier!")
                # not use cache during upload
                wikifier_res = wikifier.produce(each_df, use_cache=False)

            else:
                self._logger.info("Not run wikifier!")
                wikifier_res = each_df
                # we also need to let the cache system know not to do wikifier
                produce_config = {"target_columns": None, "target_p_nodes": None,
                                  "input_type": "pandas", "wikifier_choice": None,
                                  "threshold": 0.7
                                  }

                cache_key = self.cache_manager.get_hash_key(each_df, json.dumps(produce_config))

                # add extra information after we calculate the correct hash tag
                produce_config["use_wikifier"] = False
                response = self.cache_manager. \
                    add_to_memcache(supplied_dataframe=each_df,
                                    search_result_serialized=json.dumps(produce_config),
                                    augment_results=each_df,
                                    hash_key=cache_key
                                    )
                if not response:
                    self._logger.warning("Pushing wikifier results to memcache failed!")
                else:
                    self._logger.info("Pushed wikifier results to memcache successfully!")

            end2 = time.time()
            self._logger.info("Wikifier finished. Totally take " + str(end2 - end1) + " seconds.")
            if job is not None:
                job.meta['step'] = "wikifier running finished, start generating metadata..."
                job.meta['wikifier used'] = str(timedelta(seconds=end2 - end1))
                job.save_meta()

            # process datetime column to standard datetime
            for col_name in wikifier_res.columns.values.tolist():
                if 'date' in col_name.lower() or 'time' in col_name.lower():
                    try:
                        temp = pd.to_datetime(wikifier_res[col_name])
                        has_time_format_or_not = pd.notnull(temp).value_counts()
                        # only convert when at least 70% of the values parse as valid datetimes
                        if True in has_time_format_or_not.keys() and \
                                has_time_format_or_not[True] >= wikifier_res.shape[0] * 0.7:
                            wikifier_res[col_name] = temp
                    except Exception:
                        pass

            # TODO: need update profiler here to generate better semantic type
            metadata = datamart_utils.generate_metadata_from_dataframe(data=wikifier_res)
            self._logger.info("The uploaded data's shape is " + str(wikifier_res.shape))
            for i, each_column_meta in enumerate(metadata['variables']):
                self._logger.debug("Metadata for column No.{} is:".format(str(i)))
                self._logger.debug(str(each_column_meta))
                # if 'http://schema.org/Text' in each_column_meta['semantic_type']:
                # self.columns_are_string[df_count].append(i)

            if from_online_file:
                metadata['url'] = input_dir
                title_cleaned = input_dir.split("/")[-1]
                words_processed = remove_punctuation(title_cleaned)
                metadata['title'] = " ".join(words_processed)
                metadata['file_type'] = file_type
            if file_type == "wikitable":
                metadata['xpath'] = xpaths[df_count]

            all_wikifier_res.append(wikifier_res)
            all_metadata.append(metadata)

        end2 = time.time()
        self._logger.info("Preprocess finished. Totally take " + str(end2 - end1) + " seconds.")
        if job is not None:
            job.meta['step'] = "metadata generating finished..."
            job.meta['metadata generating used'] = str(timedelta(seconds=end2 - end1))
            job.save_meta()
        return PreParsedResult(all_wikifier_res, all_metadata)
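
A hedged invocation sketch for this richer variant; the `uploader` instance, the CSV URL, and the job handle are illustrative assumptions:

# hypothetical: `uploader` is an instance of the class defining load_and_preprocess above
pre_parsed = uploader.load_and_preprocess(
    input_dir="https://example.org/data/table.csv",
    file_type="csv",          # upgraded internally to "online_csv" once the URL proves readable
    wikifier_choice="auto",   # "true"/"false" force the choice; "auto" defers to save_wikifier_choice
    job=None,                 # an RQ job, if given, receives step and timing updates via job.meta
)
dataframes, metadata = pre_parsed.content, pre_parsed.metadata
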
    def model_data(self, doc, inputs: PreParsedResult, **kwargs):
        input_dfs = inputs.content
        metadata = inputs.metadata
        number = kwargs.get("number")  # an int
        uploader_information = kwargs.get("uploader_information")
        self._logger.debug("Start modeling data into blazegraph format...")
        start = time.time()
        job = kwargs.get("job", None)
        need_process_columns = kwargs.get("need_process_columns", None)
        if need_process_columns is None:
            need_process_columns = list(range(input_dfs[number].shape[1]))
        else:
            self._logger.info("Received specified target process columns as {}".format(str(need_process_columns)))
            for each_column_number in need_process_columns:
                if each_column_number >= input_dfs[number].shape[1]:
                    raise ValueError(
                        "The given column number {} exceeds the dataset's column count of {}.".format(
                            each_column_number, input_dfs[number].shape[1]))

            for each_col in range(input_dfs[number].shape[1]):
                if each_col not in need_process_columns and check_is_q_node_column(input_dfs[number], each_col):
                    self._logger.info("Automatically add Q node column at No.{} {} as index list!".format(str(each_col), str(
                        input_dfs[number].columns[each_col])))
                    need_process_columns.append(each_col)

        # updated v2019.12.5: now use the md5 value of dataframe hash as the dataset id
        pandas_id = str(hash_pandas_object(input_dfs[number]).sum())
        hash_generator = hashlib.md5()
        hash_generator.update(pandas_id.encode('utf-8'))
        hash_url_key = hash_generator.hexdigest()
        modeled_data_id = hash_url_key

        if metadata is None:
            metadata = [{} for _ in input_dfs]
        if metadata[number] is None:
            metadata[number] = {}
        extra_information = {}
        title = metadata[number].get("title") or ""
        keywords = metadata[number].get("keywords") or ""
        file_type = metadata[number].get("file_type") or ""
        # TODO: if no url given?
        url = metadata[number].get("url") or "http://"

        # update v2019.12.6, now adapt special requirement from keywords
        if type(keywords) is str:
            keywords_list = []
            if keywords.find(config_datamart.upload_special_requirement_mark, 0) != -1 and keywords.find(
                    config_datamart.upload_special_requirement_mark, 0) != keywords.find(
                config_datamart.upload_special_requirement_mark, 1):
                keywords_list.append(keywords[keywords.find(config_datamart.upload_special_requirement_mark, 0):
                                              keywords.find(config_datamart.upload_special_requirement_mark, 1) +
                                              len(config_datamart.upload_special_requirement_mark)])
                keywords = keywords[keywords.find(config_datamart.upload_special_requirement_mark, 1) +
                                    len(config_datamart.upload_special_requirement_mark) + 1:]
            keywords_list.extend(keywords.split(","))
        else:
            keywords_list = keywords
        words_processed = []
        for each in keywords_list:
            if each.startswith("*&#") and each.endswith("*&#"):
                self._logger.info("Special requirement from keyword area detected as {}".format(each))
                special_requirement = json.loads(each[3: -3])
                extra_information['special_requirement'] = special_requirement
            else:
                words_processed.extend(remove_punctuation(each))
        # updated v2020.1.3: now also do keywords augmentation during uploading process
        words_processed = datamart_utils.keywords_augmentation(words_processed)
        # also augment title and save as keywords
        words_processed.extend(datamart_utils.keywords_augmentation(remove_punctuation(title, "list")))
        keywords = " ".join(set(words_processed))

        node_id = 'D' + str(modeled_data_id)
        q = WDItem(node_id)
        if 'xpath' in metadata[number]:
            extra_information['xpath'] = metadata[number]['xpath']

        data_metadata = {'shape_0': input_dfs[number].shape[0], 'shape_1': input_dfs[number].shape[1]}
        for i, each in enumerate(metadata[number]['variables']):
            each_column_meta = {'semantic_type': each['semantic_type'], 'name': input_dfs[number].columns[i]}
            extra_information['column_meta_' + str(i)] = each_column_meta
        extra_information['data_metadata'] = data_metadata

        # updated v2019.10.14, add first 10 rows of each dataset in extra information for checking
        extra_information['first_10_rows'] = input_dfs[number].iloc[:10].to_csv()
        # updated v2019.10.14, trying to save a local backup of the downloaded dataframe to increase the speed
        hash_generator = hashlib.md5()
        hash_generator.update(url.encode('utf-8'))
        hash_url_key = hash_generator.hexdigest()
        dataset_cache_loc = os.path.join(config_datamart.cache_file_storage_base_loc, "datasets_cache")
        cache_file_loc = os.path.join(dataset_cache_loc, hash_url_key + ".h5")
        if not os.path.exists(dataset_cache_loc):
            os.makedirs(dataset_cache_loc, exist_ok=True)

        input_dfs[number].to_hdf(cache_file_loc, key='df', mode='w', format='fixed')
        extra_information['local_storage'] = cache_file_loc

        # for each_key in ["", ]:
        #     if each_key not in uploader_information:
        #         uploader_information[each_key] = "None"

        q.add_label(node_id, lang='en')
        q.add_statement('P31', Item('Q1172284'))  # indicate it is subclass of a dataset
        q.add_statement('P2699', URLValue(url))  # url
        q.add_statement('P2701', StringValue(file_type))  # file type
        q.add_statement('P1476', MonolingualText(title, lang='en'))  # title
        q.add_statement('C2001', StringValue(node_id))  # datamart identifier
        q.add_statement('C2004', StringValue(keywords))  # keywords
        q.add_statement('C2010', StringValue(json.dumps(extra_information)))
        q.add_statement('C2014', StringValue(json.dumps(uploader_information)))

        end1 = time.time()
        if job is not None:
            job.meta['step'] = "Modeling abstract data finished."
            job.meta['modeling abstract'] = str(timedelta(seconds=end1 - start))
            job.save_meta()

        self._logger.info("Modeling abstract data finished. Totally take " + str(end1 - start) + " seconds.")

        # each columns
        for i in need_process_columns:
            if job is not None:
                job.meta['step'] = "Modeling ({}/{}) column ...".format(str(i), str(input_dfs[number].shape[1]))
                job.save_meta()
            try:
                semantic_type = metadata[number]['variables'][i]['semantic_type']
            except IndexError:
                semantic_type = ['http://schema.org/Text']  # process_one_column expects a list of semantic types
            model_column_time_limit = 600
            self._logger.info(
                "Currently setting the maximum modeling time per column to " + str(model_column_time_limit) + " seconds.")
            # use timeout to prevent stuck on some columns
            res = timeout_call(model_column_time_limit, self.process_one_column,
                               [input_dfs[number].iloc[:, i], q, i, semantic_type])
            # res = self.process_one_column(column_data=input_dfs[number].iloc[:, i], item=q, column_number=i,
            #                               semantic_type=semantic_type)
            if res is None:
                self._logger.error("Error when modeling column " + str(i) + ". Maybe timeout? Will skip.")
            else:
                q = res
        doc.kg.add_subject(q)
        end2 = time.time()
        self._logger.info("Modeling detail data finished. Totally take " + str(end2 - end1) + " seconds.")
        if job is not None:
            job.meta['step'] = "Modeling finished. Start uploading..."
            job.meta['modeling'] = str(timedelta(seconds=end2 - end1))
            job.save_meta()
        # return the updated etk doc and the corresponding dataset id
        return doc, node_id
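
A sketch of chaining the two steps above into an upload; the `uploader` instance, the etk `doc` with its .kg graph, and the uploader_information dict are illustrative assumptions:

# hypothetical: `uploader` defines the methods above, `doc` is an etk document exposing doc.kg
pre_parsed = uploader.load_and_preprocess(input_dir="https://example.org/data/table.csv", file_type="csv")
doc, dataset_id = uploader.model_data(
    doc, pre_parsed,
    number=0,                                           # which dataframe in pre_parsed.content to model
    uploader_information={"username": "example-user"},  # stored verbatim under property C2014
    need_process_columns=None,                          # None means every column gets modeled
)
print("modeled dataset node:", dataset_id)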