Example 1
def test_load(get_data_file, test_output_dir):
    file_name = get_data_file(
        "meteorites.csv",
        "https://data.nasa.gov/api/views/gh4g-9sfh/rows.csv?accessType=DOWNLOAD",
    )

    # For reproducibility
    np.random.seed(7331)

    df = pd.read_csv(file_name)

    # Note: Pandas does not support dates before 1880, so we ignore these for this analysis
    df["year"] = pd.to_datetime(df["year"], errors="coerce")

    # Example: Constant variable
    df["source"] = "NASA"

    # Example: Boolean variable
    df["boolean"] = np.random.choice([True, False], df.shape[0])

    # Example: Mixed with base types
    df["mixed"] = np.random.choice([1, "A"], df.shape[0])

    # Example: Highly correlated variables
    df["reclat_city"] = df["reclat"] + np.random.normal(scale=5, size=(len(df)))

    # Example: Duplicate observations
    duplicates_to_add = pd.DataFrame(df.iloc[0:10].copy())

    df = pd.concat([df, duplicates_to_add], ignore_index=True)

    profile1 = ProfileReport(
        df,
        title="NASA Meteorites",
        samples={"head": 5, "tail": 5},
        duplicates={"head": 10},
        minimal=True,
        progress_bar=False,
    )

    test_output_path = test_output_dir / "NASA-Meteorites.pp"
    json1 = profile1.to_json()
    profile1.dump(test_output_path)
    _ = profile1.to_html()

    assert test_output_path.exists(), "Output file does not exist"

    profile2 = ProfileReport(df, progress_bar=False).load(test_output_path)
    # json1 was computed before dump(), so the loaded profile already carries a _description_set
    assert isinstance(profile2._description_set, dict)

    # profile1 is lazy; the HTML is only rendered after dump(), so the loaded _report should be None
    assert profile2._report is None

    json2 = profile2.to_json()

    # both profiles should generate the same output
    assert json1 == json2
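The test above exercises ProfileReport's dump/load round trip. A minimal sketch of that pattern, assuming the same pandas-profiling dump/load API and a hypothetical DataFrame df:

from pandas_profiling import ProfileReport

# Profile lazily, force the description to be computed, then persist it.
profile = ProfileReport(df, minimal=True, progress_bar=False)
json_before = profile.to_json()      # computes the description set
profile.dump("profile.pp")           # serialize the computed results

# A fresh report can be rehydrated from the dump; its JSON output matches.
restored = ProfileReport(df, progress_bar=False).load("profile.pp")
assert restored.to_json() == json_before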
Example 2
	def find(self, topic_id: TopicId, start_time: datetime, end_time: datetime) -> Optional[TopicProfile]:
		schema = get_topic_schema(topic_id, self.principalService)
		if is_raw_topic(schema.get_topic()):
			raise DqcException(f'Raw topic[name={schema.get_topic().name}] is not supported for profiling.')
		storage = ask_topic_storage(schema, self.principalService)
		service = ask_topic_data_service(schema, storage, self.principalService)
		criteria = [
			EntityCriteriaExpression(
				left=ColumnNameLiteral(columnName=TopicDataColumnNames.TENANT_ID.value),
				right=self.principalService.get_tenant_id()),
			EntityCriteriaExpression(
				left=ColumnNameLiteral(columnName=TopicDataColumnNames.UPDATE_TIME.value),
				operator=EntityCriteriaOperator.GREATER_THAN_OR_EQUALS,
				right=start_time),
			EntityCriteriaExpression(
				left=ColumnNameLiteral(columnName=TopicDataColumnNames.UPDATE_TIME.value),
				operator=EntityCriteriaOperator.LESS_THAN_OR_EQUALS,
				right=end_time)
		]
		data = service.find(criteria)

		columns = [
			TopicDataColumnNames.ID.value,
			*ArrayHelper(schema.get_topic().factors).map(lambda x: x.name).to_list(),
			TopicDataColumnNames.TENANT_ID.value,
			TopicDataColumnNames.INSERT_TIME.value,
			TopicDataColumnNames.UPDATE_TIME.value
		]

		def row_to_list(row: Dict[str, Any]) -> List[Any]:
			return ArrayHelper(columns).map(lambda x: row.get(x)).to_list()

		data_frame = build_data_frame(ArrayHelper(data).map(row_to_list).to_list(), columns)
		data_frame = convert_data_frame_type_by_topic(data_frame, schema.get_topic())
		# drop bookkeeping columns so only the topic's factor columns are profiled;
		# use .value so the labels match the string column names built above
		data_frame.drop([
			TopicDataColumnNames.TENANT_ID.value,
			TopicDataColumnNames.UPDATE_TIME.value,
			TopicDataColumnNames.INSERT_TIME.value,
			TopicDataColumnNames.AGGREGATE_ASSIST.value,
			TopicDataColumnNames.ID.value,
			TopicDataColumnNames.VERSION.value
		], axis=1, inplace=True, errors='ignore')

		if data_frame.empty or len(data_frame.index) == 1:
			return None
		else:
			logger.info(f'memory_usage {data_frame.memory_usage(deep=True).sum()} bytes')
			profile = ProfileReport(data_frame, title=f'{schema.get_topic().name} data profile report', minimal=True)
			json_data = profile.to_json()
			json_constants_map = {
				'-Infinity': float('-Infinity'),
				'Infinity': float('Infinity'),
				'NaN': None,
			}
			return loads(json_data, parse_constant=lambda x: json_constants_map[x])
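The json_constants_map above exists because pandas-profiling may emit the non-standard JSON tokens Infinity, -Infinity and NaN; json.loads routes exactly those tokens through parse_constant. A self-contained sketch of the same remapping, using a hand-written JSON string instead of a real profile:

import json

constants = {
    '-Infinity': float('-inf'),
    'Infinity': float('inf'),
    'NaN': None,  # map NaN to None so the result stays JSON-serializable
}

# parse_constant is called only for the Infinity, -Infinity and NaN tokens
parsed = json.loads('{"max": Infinity, "mean": NaN}',
                    parse_constant=lambda token: constants[token])
print(parsed)  # {'max': inf, 'mean': None}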
Example 3
def pandas_profile(df: pd.DataFrame, result_html: str = 'report.html'):
    """
    Extract a pandas profiling report from the dataset.
    Do not change this method; run it and look through the HTML report it generates.
    Always be sure to investigate the profile of your dataset (max, min, missing values, number of zeros, etc.).
    """
    from pandas_profiling import ProfileReport

    profile = ProfileReport(df, title="Pandas Profiling Report")
    if result_html is not None:
        profile.to_file(result_html)
    return profile.to_json()
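A short usage sketch for the helper above; the toy DataFrame and file name are placeholders, not part of the original snippet:

import pandas as pd

df = pd.DataFrame({
    "age": [23, 35, 41, None],
    "city": ["Rome", "Milan", "Rome", "Turin"],
})

# writes report.html next to the script and returns the profile as a JSON string
report_json = pandas_profile(df, result_html="report.html")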
Example 4
def test_json(data):
    report = ProfileReport(data)
    report_json = report.to_json()
    data = json.loads(report_json)
    assert set(data.keys()) == {
        "table",
        "variables",
        "correlations",
        "missing",
        "messages",
        "package",
    }
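The data fixture is not shown in this snippet (nor in Example 6 below, which asserts the newer key set). A minimal hypothetical fixture that both tests could run against might look like this:

import numpy as np
import pandas as pd
import pytest

@pytest.fixture
def data():
    # any small mixed-type DataFrame works; the tests only check the report's top-level keys
    np.random.seed(0)
    return pd.DataFrame({
        "num": np.random.normal(size=50),
        "cat": np.random.choice(["a", "b", "c"], size=50),
    })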
Example 5
def topic_profile(topic, from_, to_, data_source):
    topic_name = topic.name
    df = query_topic_data_by_datetime(topic_name, from_, to_, topic,
                                      data_source)
    if df.empty or len(df.index) == 1:
        return None
    else:
        log.info("memory_usage {0} bytes".format(
            df.memory_usage(deep=True).sum()))
        profile = ProfileReport(
            df,
            title="{0} data profile report".format(topic_name),
            minimal=True)
        return profile.to_json()
Example 6
def test_json(data):
    report = ProfileReport(data)
    report_json = report.to_json()
    data = json.loads(report_json)
    assert set(data.keys()) == {
        "analysis",
        "correlations",
        "duplicates",
        "alerts",
        "missing",
        "package",
        "sample",
        "scatter",
        "table",
        "variables",
    }
Example 7
def write_csv(dirs, file_format, database_name):
    ds_set = []
    ds_header = ["ds_id", "ds_name"]
    att_cat_set = []
    att_cat_header = ["ds_id", "att_name"]
    att_num_set = []
    att_num_header = ["ds_id", "att_name"]

    ds_data = None
    ds_names = []

    set_ds = []
    set_at_num = []
    set_at_cat = []

    final_ds_set = []
    final_nom_set = []
    final_num_set = []
    final_ds_set_header = []
    final_nom_set_header = []
    final_num_set_header = []

    output_step = output_path + database_name + "/"
    input_step = input_path + database_name + "/"

    df = None
    att_list = []
    csv = []
    d = 0

    #read file
    for f in dirs[0][2]:
        suffix = f.split(".")[-1]
        dataset = f.split("." + suffix)[0]
        ds_names.append(dataset)
        f = input_step + f
        if file_format == "csv":
            df = pd.read_csv(f, error_bad_lines=False)
        if file_format == "excel":
            df = pd.read_excel(f)
        if file_format == "json":
            df = pd.read_json(f, orient='records')
            df = df.to_csv(sep="~", index=False)
            df = pd.read_csv(io.StringIO(df), error_bad_lines=False, sep="~")
        if len(df) > 10000:
            df = df.sample(10000)

        # get profile info
        profile = ProfileReport(df, minimal=True)
        json_data = profile.to_json()
        mf_json = json.loads(json_data)

        ds_mf = mf_json["table"]
        ds_mf["name"] = ds_names[d]
        print(ds_names[d])
        ds_mf["nominal"] = {}
        ds_mf["numeric"] = {}
        set_ds.append(ds_mf)
        variables = mf_json["variables"]
        for key in variables.keys():
            at_mf = variables[key]
            at_mf["ds_id"] = d
            if at_mf["type"] == "Variable.TYPE_NUM":
                try:
                    del at_mf["histogram_data"]
                    del at_mf["scatter_data"]
                    del at_mf["histogram_bins"]
                except KeyError:
                    print("Something happened... meh")
                ds_mf["numeric"][key] = at_mf
            else:
                ds_mf["nominal"][key] = at_mf

        att_numeric = ds_mf.pop("numeric")
        att_cat = ds_mf.pop("nominal")
        #         data = concat_values(ds_mf)
        data = ds_mf
        try:
            del data["CAT"]
            del data["BOOL"]
            del data["NUM"]
            del data["DATE"]
            del data["URL"]
            del data["COMPLEX"]
            del data["PATH"]
            del data["FILE"]
            del data["IMAGE"]
            del data["UNSUPPORTED"]
        except KeyError:
            print("...")
        data_final = {}
        id_name = data["name"].split("__")
        if len(id_name) > 1:
            data_final["ds_id"] = id_name[0]
            data_final["dataset name"] = id_name[1]
        else:
            data_final["ds_id"] = id_name[0]
            data_final["dataset name"] = id_name[0]

        ds_row = []
        final_ds_row = []

        ds_row.append(d)
        ds_row.append(data["name"])

        if len(ds_header) <= 2:
            for key in data.keys():
                ds_header.append(key)
        for key in data.keys():
            ds_row.append(data[key])
        ds_set.append(ds_row)

        ## meta-features according to Alserafi
        data_final["number of instances"] = data["n"]
        data_final["number of attributes"] = data["n_var"]
        data_final["dimensionality"] = float(data["n_var"]) / float(data["n"])
        num_cat = 0
        num_num = 0
        for key_types in data["types"].keys():
            if key_types == "NUM":
                num_num += data["types"][key_types]
            else:
                print(data["types"][key_types])
                num_cat += data["types"][key_types]
        data_final["number of nominal"] = num_cat
        data_final["number of numeric"] = num_num
        data_final["percentage of nominal"] = num_cat / float(data["n_var"])
        data_final["percentage of numeric"] = num_num / float(data["n_var"])

        #missing
        data_final["missing attribute count"] = data["n_vars_with_missing"]
        data_final["missing attribute percentage"] = float(
            data["n_vars_with_missing"]) / float(data["n_var"])
        num_missing_values = []
        ptg_missing_values = []

        numeric_final = {}
        #numeric
        means = []
        for key in att_numeric.keys():
            att_num_row = []
            att_num_row.append(att_numeric[key].pop("ds_id"))
            att_num_row.append(key)
            #             att_numeric[key] = concat_values(att_numeric[key])
            if len(att_num_header) <= 2:
                for k in att_numeric[key].keys():
                    att_num_header.append(k)
            for k in att_numeric[key].keys():
                att_num_row.append(att_numeric[key][k])
            att_num_set.append(att_num_row)
            ####
            final_num_row = []
            means.append(att_numeric[key]["mean"])
            num_missing_values.append(att_numeric[key]["n_missing"])
            ptg_missing_values.append(att_numeric[key]["p_missing"])
            numeric_final["dataset id"] = data_final["ds_id"]
            numeric_final["attribute name"] = att_num_row[1]
            numeric_final["number distinct values"] = att_numeric[key][
                "distinct_count_without_nan"]
            numeric_final["percentage distinct values"] = float(
                att_numeric[key]["distinct_count_without_nan"]) / float(
                    att_numeric[key]["n"])
            numeric_final["percentage missing values"] = att_numeric[key][
                "p_missing"]
            numeric_final["mean"] = att_numeric[key]["mean"]
            numeric_final["standard deviation"] = att_numeric[key]["std"]
            numeric_final["minimum value"] = att_numeric[key]["min"]
            numeric_final["maximum value"] = att_numeric[key]["max"]
            numeric_final["range"] = att_numeric[key]["range"]
            numeric_final["coefficient of variance"] = att_numeric[key]["cv"]
            if len(final_num_set_header) == 0:
                for final_key in numeric_final.keys():
                    final_num_set_header.append(final_key)
            for final_key in numeric_final.keys():
                final_num_row.append(numeric_final[final_key])
            final_num_set.append(final_num_row)

        if len(means) == 0:
            means = [0]
        means = np.array(means)
        data_final["average of means"] = np.average(means)
        data_final["standard deviation of means"] = np.std(means)
        data_final["minimum number of means"] = np.amin(means)
        data_final["maximum number of means"] = np.amax(means)

        #nominal
        nominal_final = {}
        num_distinct = []
        for key in att_cat.keys():
            if key == "<page title>":
                continue
            att_cat_row = []
            num_distinct.append(len(att_cat[key]["value_counts"].keys()))
            att_cat_row.append(att_cat[key].pop("ds_id"))
            att_cat_row.append(key)
            vcounts = []
            pvcounts = []
            string_values = ""
            for vkey in att_cat[key]["value_counts"].keys():
                vcounts.append(float(att_cat[key]["value_counts"][vkey]))
                pvcounts.append(
                    float(att_cat[key]["value_counts"][vkey]) /
                    float(att_cat[key]["n"]))
                if string_values != "":
                    string_values = string_values + "|"
                text = att_cat[key]["value_counts"][vkey]
                text = ''.join(char for char in text if ord(char) < 128)
                text = text.replace("~", "").replace("\n", " ")
                text2 = vkey
                text2 = ''.join(char for char in text2 if ord(char) < 128)
                text2 = text2.replace("~", "").replace("\n", " ")
                string_values = string_values + str(text2) + " " + str(text)

            if len(vcounts) == 0:
                vcounts = [0]
            if len(pvcounts) == 0:
                pvcounts = [0]
            vcounts = np.array(vcounts)
            pvcounts = np.array(pvcounts)
            #             att_cat[key] = concat_values(att_cat[key])
            if len(att_cat_header) <= 2:
                for k in att_cat[key].keys():
                    att_cat_header.append(k)
            for k in att_cat[key].keys():
                att_cat_row.append(att_cat[key][k])
            att_cat_set.append(att_cat_row)
            #####
            final_nom_row = []
            num_missing_values.append(att_cat[key]["n_missing"])
            ptg_missing_values.append(att_cat[key]["p_missing"])
            nominal_final["dataset id"] = data_final["ds_id"]
            nominal_final["attribute name"] = att_cat_row[1]
            #############################
            #             if ds_names[d] == "www.best-deal-items.com":
            #                 print("################THE KEY##############: "+key)
            #                 print(att_cat[key])
            #############################
            nominal_final["number distinct values"] = att_cat[key][
                "distinct_count_without_nan"]
            nominal_final["percentage distinct values"] = float(
                att_cat[key]["distinct_count_without_nan"]) / float(
                    att_cat[key]["n"])
            nominal_final["percentage missing values"] = att_cat[key][
                "p_missing"]
            nominal_final["mean number of string values"] = np.average(vcounts)
            nominal_final[
                "standard deviation number of string values"] = np.std(vcounts)
            nominal_final["minimum number of string values"] = np.amin(vcounts)
            nominal_final["maximum number of string values"] = np.amax(vcounts)
            nominal_final["median percentage of string values"] = np.median(
                pvcounts)
            nominal_final[
                "standard deviation percentage of string values"] = np.std(
                    pvcounts)
            nominal_final["minimum percentage of string values"] = np.amin(
                pvcounts)
            nominal_final["maximum percentage of string values"] = np.amax(
                pvcounts)
            nominal_final["string values"] = string_values
            if len(final_nom_set_header) == 0:
                for final_key in nominal_final.keys():
                    final_nom_set_header.append(final_key)
            for final_key in nominal_final.keys():
                final_nom_row.append(nominal_final[final_key])
            final_nom_set.append(final_nom_row)

        data_final["average number of distintc values"] = None
        data_final["standard deviation of distintc values"] = None
        data_final["minimum number of distintc values"] = None
        data_final["maximum number of distintc values"] = None
        if len(att_cat.keys()) > 0:
            num_distinct = np.array(num_distinct)
            data_final["average number of distintc values"] = np.average(
                num_distinct)
            data_final["standard deviation of distintc values"] = np.std(
                num_distinct)
            data_final["minimum number of distintc values"] = np.amin(
                num_distinct)
            data_final["maximum number of distintc values"] = np.amax(
                num_distinct)

        ##missing
        num_missing_values = np.array(num_missing_values)
        ptg_missing_values = np.array(ptg_missing_values)
        data_final["average number of missing values"] = np.average(
            num_missing_values)
        data_final["standard deviation of missing values"] = np.std(
            num_missing_values)
        data_final["minimum number of missing values"] = np.amin(
            num_missing_values)
        data_final["maximum number of missing values"] = np.amax(
            num_missing_values)
        data_final["average number of percentage missing values"] = np.average(
            ptg_missing_values)
        data_final["standard deviation of percentage missing values"] = np.std(
            ptg_missing_values)
        data_final["minimum number of percentage missing values"] = np.amin(
            ptg_missing_values)
        data_final["maximum number of percentage missing values"] = np.amax(
            ptg_missing_values)

        if len(final_ds_set_header) == 0:
            for final_key in data_final.keys():
                final_ds_set_header.append(final_key)
        for final_key in data_final.keys():
            final_ds_row.append(data_final[final_key])

        final_ds_set.append(final_ds_row)
        d += 1

#     output_path = input_path+"/mf_output/"
#     output_path = "../input/monitor_mf_output/"
    if not os.path.exists(output_step):
        Path(output_step).mkdir(parents=True, exist_ok=True)
    df = pd.DataFrame(final_ds_set, columns=final_ds_set_header)
    df.to_csv(output_step + "ds.csv", index=False, sep="~")
    df = pd.DataFrame(final_nom_set, columns=final_nom_set_header)
    df.to_csv(output_step + "attr_nom.csv", index=False, sep="~")
    df = pd.DataFrame(final_num_set, columns=final_num_set_header)
    df.to_csv(output_step + "attr_num.csv", index=False, sep="~")
Example 8
def get_EDA(file_path):
    try:
        df = pd.read_csv(file_path)
        profile = ProfileReport(df, minimal=True)
        # profile.to_file(output_file="your_report.json")
        profile = json.loads(profile.to_json())
        features = set()
        ignore_features = {
            "value_counts", "value_counts_with_nan",
            "value_counts_without_nan", "histogram_data", "scatter_data"
        }
        table_features = {"memory_size", "n_cells_missing"}
        # print(f'Profile: {profile.keys()}')
        report = defaultdict(str)
        overview = defaultdict(str)
        report["rows"] = []
        report["columns"] = [{
            "label": "Column",
            "field": "Column",
            "sort": 'asc',
            "width": 150
        }]

        #overview of table
        for feature, val in profile['table'].items():

            if feature in table_features:
                overview[feature] = list(val.values())[0]
            elif feature == "types":
                overview[feature] = defaultdict(str)
                for k, v in profile['table']['types'].items():
                    overview[feature][k] = list(v.values())[0]
            else:
                overview[feature] = val

        #info about variables
        for variable in profile['variables'].keys():
            # print("variable", variable)
            row = {"Column": variable}
            for feature, val in profile['variables'][variable].items():
                if feature not in ignore_features:
                    features.add(feature)  # set membership makes the "not in" check redundant

                    if isinstance(val, dict):
                        # print(f'{feature:>30}')
                        row[feature] = list(val.values())[0]
                    elif feature == 'date_warning' and val is True:
                        row["type"] = "datetime"
                    else:
                        row[feature] = val
            report["rows"].append(row)
        for feature in features:
            report["columns"].append({
                "label": feature,
                "field": feature,
                # "sort": 'asc',
                # "width": 200
            })

        sample = json.loads(df.sample(min(20, len(df))).to_json(orient='index'))

        sample_report = {"rows": [], "columns": []}
        key = ""
        for k, v in sample.items():

            sample_report['rows'].append(v)
            key = k
        for k in sorted(sample[key].keys()):
            sample_report['columns'].append({
                "label": k,
                "field": k,
                "width": 200
                # "sort": 'asc',
            })
        # with open('sample_json.json','w') as f:
        #     json.dump(sample_report,f)
        return {
            "report": json.dumps(report),
            "overview": overview,
            "sample": sample_report
        }
    except Exception as e:
        tb = sys.exc_info()[-1]
        print(
            f'Error: {str(e)} in line {tb.tb_lineno} in function: {traceback.extract_tb(tb, 1)[0][2]}'
        )
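A short usage sketch for get_EDA with a hypothetical CSV path; the returned dict bundles the JSON-encoded column report, the table overview and a sampled set of rows (None is returned implicitly if an exception was caught):

result = get_EDA("uploads/sales.csv")  # hypothetical path
if result is not None:
    print(result["overview"])                         # table-level statistics
    print(len(json.loads(result["report"])["rows"]))  # one row per column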