Exemple #1
0
 def __init__(self, data: pd.DataFrame):
     """Compute and store the Facets Overview statistics proto for ``data``.

     The dataframe is handed to the statistics generator as a single
     dataset named 'data'. Per the original notes, the resulting proto
     carries the inputs to the Facets Overview plots:
       - numeric variables: histogram bins, mean, min, median, max, etc.
       - categorical variables: number of unique values, counts per
         category for the bar chart, top category, etc.

     Args:
         data: the dataframe to summarise.
     """
     dataset = {'name': 'data', 'table': data}
     generator = GenericFeatureStatisticsGenerator()
     self._proto = generator.ProtoFromDataFrames([dataset])
Exemple #2
0
 def _display_overview(self, data, update=None):
     """Render the Facets Overview for ``data``, or refresh an existing one.

     Args:
         data: table passed to the statistics generator as the single
             dataset named 'data'.
         update: when truthy, it is used as the display id in the script
             template and the overview is refreshed through JavaScript;
             otherwise a new HTML overview is displayed under
             ``self._overview_display_id``.
     """
     stats = GenericFeatureStatisticsGenerator().ProtoFromDataFrames(
         [{'name': 'data', 'table': data}])
     encoded = base64.b64encode(stats.SerializeToString()).decode('utf-8')

     # First-time rendering: emit the full HTML block and stop.
     if not update:
         page = _OVERVIEW_HTML_TEMPLATE.format(
             display_id=self._overview_display_id, protostr=encoded)
         display(HTML(page))
         return

     # Refresh path: push the new proto string into the existing display.
     refresh = _OVERVIEW_SCRIPT_TEMPLATE.format(display_id=update,
                                                protostr=encoded)
     display_javascript(Javascript(refresh))
Exemple #3
0
  def _display_overview(self, data, update=None):
    """Render the Facets Overview for ``data``, or refresh an existing one.

    When the data is non-empty, ``self._include_window_info`` is set and the
    'event_time', 'windows' and 'pane_info' columns are all present, those
    three columns are dropped before the statistics are computed.

    Args:
      data: dataframe passed to the statistics generator as the single
        dataset named 'data'.
      update: when truthy, an object whose ``_overview_display_id`` names the
        display to refresh through JavaScript; otherwise a new HTML overview
        is displayed under ``self._overview_display_id``.
    """
    window_columns = ['event_time', 'windows', 'pane_info']
    if (not data.empty and self._include_window_info and
        all(name in data.columns for name in window_columns)):
      data = data.drop(window_columns, axis=1)

    stats = GenericFeatureStatisticsGenerator().ProtoFromDataFrames(
        [{'name': 'data', 'table': data}])
    encoded = base64.b64encode(stats.SerializeToString()).decode('utf-8')

    # First-time rendering: emit the full HTML block and stop.
    if not update:
      page = _OVERVIEW_HTML_TEMPLATE.format(
          display_id=self._overview_display_id, protostr=encoded)
      display(HTML(page))
      return

    # Refresh path: push the new proto string into the existing display.
    refresh = _OVERVIEW_SCRIPT_TEMPLATE.format(
        display_id=update._overview_display_id, protostr=encoded)
    display_javascript(Javascript(refresh))
Exemple #4
0
def tables1():
    """Save the uploaded CSV, compute its Facets Overview stats and render them.

    The upload is read from ``request.files["file"]``, written into the
    upload directory, loaded with pandas, and summarised into a feature
    statistics proto. The base64-encoded proto is passed to the
    ``examples/tables.html`` template as ``data``.

    Returns:
        The result of ``render_template("examples/tables.html", ...)``.
    """
    # The second component is absolute, so os.path.join returns it unchanged
    # and APP_ROOT does not influence the resulting path.
    target = os.path.join(APP_ROOT, "/home/aayushi/ml-simu")

    # exist_ok avoids the check-then-create race of isdir() + mkdir() and
    # also creates missing parent directories.
    os.makedirs(target, exist_ok=True)

    upload = request.files["file"]
    # The filename is supplied by the client. Keep only its final path
    # component so an absolute name or one containing "../" cannot make the
    # file land outside the upload directory.
    destination = os.path.join(target, os.path.basename(upload.filename))
    print(destination)
    upload.save(destination)
    data = pd.read_csv(destination)
    from facets_overview.generic_feature_statistics_generator import GenericFeatureStatisticsGenerator
    import base64

    gfsg = GenericFeatureStatisticsGenerator()
    proto = gfsg.ProtoFromDataFrames([{'name': 'train', 'table': data}])
    protostr1 = base64.b64encode(proto.SerializeToString()).decode("utf-8")

    return render_template("examples/tables.html", data=protostr1)
class GenericFeatureStatisticsGeneratorTest(googletest.TestCase):
    """Unit tests for GenericFeatureStatisticsGenerator."""

    def setUp(self):
        """Create a fresh generator for every test."""
        self.gfsg = GenericFeatureStatisticsGenerator()

    def testProtoFromDataFrames(self):
        """A two-column dataframe yields one dataset with an int and a string feature."""
        data = [[1, 'hi'], [2, 'hello'], [3, 'hi']]
        df = pd.DataFrame(data,
                          columns=['testFeatureInt', 'testFeatureString'])
        dataframes = [{'table': df, 'name': 'testDataset'}]
        p = self.gfsg.ProtoFromDataFrames(dataframes)

        self.assertEqual(1, len(p.datasets))
        test_data = p.datasets[0]
        self.assertEqual('testDataset', test_data.name)
        self.assertEqual(3, test_data.num_examples)
        self.assertEqual(2, len(test_data.features))

        # The test accepts the two features in either order.
        if test_data.features[0].name == 'testFeatureInt':
            numfeat = test_data.features[0]
            stringfeat = test_data.features[1]
        else:
            numfeat = test_data.features[1]
            stringfeat = test_data.features[0]

        self.assertEqual('testFeatureInt', numfeat.name)
        self.assertEqual(self.gfsg.fs_proto.INT, numfeat.type)
        self.assertEqual(1, numfeat.num_stats.min)
        self.assertEqual(3, numfeat.num_stats.max)
        self.assertEqual('testFeatureString', stringfeat.name)
        self.assertEqual(self.gfsg.fs_proto.STRING, stringfeat.type)
        self.assertEqual(2, stringfeat.string_stats.unique)

    def testNdarrayToEntry(self):
        """NdarrayToEntry reports the expected 'missing' count for float and str arrays."""
        # None and NaN are both expected to be counted as missing.
        arr = np.array([1.0, 2.0, None, float('nan'), 3.0], dtype=float)

        entry = self.gfsg.NdarrayToEntry(arr)
        self.assertEqual(2, entry['missing'])

        # The NaN placed in the string array is expected to count as missing.
        arr = np.array(['a', 'b', float('nan'), 'c'], dtype=str)
        entry = self.gfsg.NdarrayToEntry(arr)
        self.assertEqual(1, entry['missing'])

    def testNdarrayToEntryTimeTypes(self):
        """datetime64 and timedelta64 arrays come back as integer nanosecond values."""
        arr = np.array(
            [np.datetime64('2005-02-25'),
             np.datetime64('2006-02-25')],
            dtype=np.datetime64)
        entry = self.gfsg.NdarrayToEntry(arr)
        # Nanoseconds since the Unix epoch for the two dates above.
        self.assertEqual([1109289600000000000, 1140825600000000000],
                         entry['vals'])

        arr = np.array(
            [np.datetime64('2009-01-01') - np.datetime64('2008-01-01')],
            dtype=np.timedelta64)
        entry = self.gfsg.NdarrayToEntry(arr)
        # 366 days (2008 is a leap year) expressed in nanoseconds.
        self.assertEqual([31622400000000000], entry['vals'])

    def testDTypeToType(self):
        """DtypeToType maps numpy dtypes onto the proto INT/FLOAT/STRING types."""
        self.assertEqual(self.gfsg.fs_proto.INT,
                         self.gfsg.DtypeToType(np.dtype(np.int32)))
        # Boolean and time types treated as int.
        # The builtin bool is used because the np.bool alias (which pointed
        # to the builtin) was removed in NumPy 1.24.
        self.assertEqual(self.gfsg.fs_proto.INT,
                         self.gfsg.DtypeToType(np.dtype(bool)))
        self.assertEqual(self.gfsg.fs_proto.INT,
                         self.gfsg.DtypeToType(np.dtype(np.datetime64)))
        self.assertEqual(self.gfsg.fs_proto.INT,
                         self.gfsg.DtypeToType(np.dtype(np.timedelta64)))
        self.assertEqual(self.gfsg.fs_proto.FLOAT,
                         self.gfsg.DtypeToType(np.dtype(np.float32)))
        # The builtin str replaces the removed np.str alias for the same reason.
        self.assertEqual(self.gfsg.fs_proto.STRING,
                         self.gfsg.DtypeToType(np.dtype(str)))
        # Unsupported types treated as string for now
        self.assertEqual(self.gfsg.fs_proto.STRING,
                         self.gfsg.DtypeToType(np.dtype(np.void)))

    def testGetDatasetsProtoFromEntriesLists(self):
        """GetDatasetsProto builds numeric stats and a quantiles histogram from entries."""
        entries = {}
        entries['testFeature'] = {
            'vals': [1, 2, 3],
            'counts': [1, 1, 1],
            'missing': 0,
            'type': self.gfsg.fs_proto.INT
        }
        datasets = [{'entries': entries, 'size': 3, 'name': 'testDataset'}]
        p = self.gfsg.GetDatasetsProto(datasets)

        self.assertEqual(1, len(p.datasets))
        test_data = p.datasets[0]
        self.assertEqual('testDataset', test_data.name)
        self.assertEqual(3, test_data.num_examples)
        self.assertEqual(1, len(test_data.features))
        numfeat = test_data.features[0]
        self.assertEqual('testFeature', numfeat.name)
        self.assertEqual(self.gfsg.fs_proto.INT, numfeat.type)
        self.assertEqual(1, numfeat.num_stats.min)
        self.assertEqual(3, numfeat.num_stats.max)
        hist = numfeat.num_stats.common_stats.num_values_histogram
        buckets = hist.buckets
        self.assertEqual(self.gfsg.histogram_proto.QUANTILES, hist.type)
        self.assertEqual(10, len(buckets))
        self.assertEqual(1, buckets[0].low_value)
        self.assertEqual(1, buckets[0].high_value)
        self.assertEqual(.3, buckets[0].sample_count)
        self.assertEqual(1, buckets[9].low_value)
        self.assertEqual(1, buckets[9].high_value)
        self.assertEqual(.3, buckets[9].sample_count)

    def testGetDatasetsProtoSequenceExampleHistogram(self):
        """Entries with 'feat_lens' produce a feature-list-length quantiles histogram."""
        entries = {}
        entries['testFeature'] = {
            'vals': [1, 2, 2, 3],
            'counts': [1, 2, 1],
            'feat_lens': [1, 2, 1],
            'missing': 0,
            'type': self.gfsg.fs_proto.INT
        }
        datasets = [{'entries': entries, 'size': 3, 'name': 'testDataset'}]
        p = self.gfsg.GetDatasetsProto(datasets)
        hist = p.datasets[0].features[
            0].num_stats.common_stats.feature_list_length_histogram
        buckets = hist.buckets
        self.assertEqual(self.gfsg.histogram_proto.QUANTILES, hist.type)
        self.assertEqual(10, len(buckets))
        self.assertEqual(1, buckets[0].low_value)
        self.assertEqual(1, buckets[0].high_value)
        self.assertEqual(.3, buckets[0].sample_count)
        self.assertEqual(1.8, buckets[9].low_value)
        self.assertEqual(2, buckets[9].high_value)
        self.assertEqual(.3, buckets[9].sample_count)

    def testGetDatasetsProtoWithWhitelist(self):
        """Only the features named in ``features`` appear in the resulting proto."""
        entries = {}
        entries['testFeature'] = {
            'vals': [1, 2, 3],
            'counts': [1, 1, 1],
            'missing': 0,
            'type': self.gfsg.fs_proto.INT
        }
        entries['ignoreFeature'] = {
            'vals': [5, 6],
            'counts': [1, 1],
            'missing': 1,
            'type': self.gfsg.fs_proto.INT
        }
        datasets = [{'entries': entries, 'size': 3, 'name': 'testDataset'}]
        p = self.gfsg.GetDatasetsProto(datasets, features=['testFeature'])

        self.assertEqual(1, len(p.datasets))
        test_data = p.datasets[0]
        self.assertEqual('testDataset', test_data.name)
        self.assertEqual(3, test_data.num_examples)
        self.assertEqual(1, len(test_data.features))
        numfeat = test_data.features[0]
        self.assertEqual('testFeature', numfeat.name)
        self.assertEqual(1, numfeat.num_stats.min)

    def testGetDatasetsProtoWithMaxHistigramLevelsCount(self):
        """``histogram_categorical_levels_count`` caps the rank histogram bucket count."""
        # Selected entries' lengths make it easy to compute average length
        data = [['hi'], ['good'], ['hi'], ['hi'], ['a'], ['a']]
        df = pd.DataFrame(data, columns=['testFeatureString'])
        dataframes = [{'table': df, 'name': 'testDataset'}]
        # Getting proto from ProtoFromDataFrames instead of GetDatasetsProto
        # directly to avoid any hand written values ex: size of dataset.
        p = self.gfsg.ProtoFromDataFrames(dataframes,
                                          histogram_categorical_levels_count=2)

        self.assertEqual(1, len(p.datasets))
        test_data = p.datasets[0]
        self.assertEqual('testDataset', test_data.name)
        self.assertEqual(6, test_data.num_examples)
        self.assertEqual(1, len(test_data.features))
        stringfeat = test_data.features[0]
        self.assertEqual('testFeatureString', stringfeat.name)

        top_values = stringfeat.string_stats.top_values
        self.assertEqual(3, top_values[0].frequency)
        self.assertEqual('hi', top_values[0].value)

        self.assertEqual(3, stringfeat.string_stats.unique)
        self.assertEqual(2, stringfeat.string_stats.avg_length)

        # Three distinct values exist, but only the top two levels are kept.
        rank_hist = stringfeat.string_stats.rank_histogram
        buckets = rank_hist.buckets
        self.assertEqual(2, len(buckets))
        self.assertEqual('hi', buckets[0].label)
        self.assertEqual(3, buckets[0].sample_count)
        self.assertEqual('a', buckets[1].label)
        self.assertEqual(2, buckets[1].sample_count)
Exemple #6
0
# -*- coding: utf-8 -*-
import pandas as pd
import dash
import dash_html_components as html
import base64
from facets_overview.generic_feature_statistics_generator import GenericFeatureStatisticsGenerator


DEBUG = True
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']

# Load the dataset and turn its feature statistics into a base64 string that
# the <facets-overview> element accepts as its protoInput.
data = pd.read_csv("dataset.csv")
gfsg = GenericFeatureStatisticsGenerator()
proto = gfsg.ProtoFromDataFrames([{'name': 'train', 'table': data}])
protostr = base64.b64encode(proto.SerializeToString()).decode("utf-8")

app = dash.Dash('')

# Document embedded in the iframe: loads the web-components polyfill and the
# Facets bundle, then feeds the encoded statistics to the overview element.
facets_document = """
       <script src="https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js"></script>
        <link rel="import" href="https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html" >
        <facets-overview id="elem"></facets-overview>
        <script>
          document.querySelector("#elem").protoInput = "{protostr}";
        </script>""".format(protostr=protostr)

overview_frame = html.Iframe(width="1200", height="800",
                             srcDoc=facets_document)
app.layout = html.Div(children=[overview_frame])

# Expose the underlying server object under the module-level name `server`.
server = app.server
        <facets-dive id="elem" height="600"></facets-dive>
        <script>
          var data = {jsonstr};
          document.querySelector("#elem").data = data;
        </script>"""
# Substitute the JSON payload into the template's {jsonstr} placeholder and
# hand the result to display().
# NOTE(review): `html` is a plain str here; presumably it should be wrapped in
# IPython's HTML(...) so it renders as markup instead of text — confirm.
html = HTML_TEMPLATE.format(jsonstr=jsonstr)
display(html)

# Create the feature stats for the datasets and stringify it.
import base64
from facets_overview.generic_feature_statistics_generator import GenericFeatureStatisticsGenerator

# Compute the feature statistics for the single 'train' dataset, then
# base64-encode the serialized proto so it can be embedded in the page.
gfsg = GenericFeatureStatisticsGenerator()
train_dataset = {'name': 'train', 'table': covid_data}
proto = gfsg.ProtoFromDataFrames([train_dataset])
serialized_stats = proto.SerializeToString()
protostr = base64.b64encode(serialized_stats).decode("utf-8")

# Build the markup for the facets overview visualization of this data.
from IPython.core.display import display, HTML

HTML_TEMPLATE = """
        <script src="https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js"></script>
        <link rel="import" href="https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html" >
        <facets-overview id="elem"></facets-overview>
        <script>
          document.querySelector("#elem").protoInput = "{protostr}";
        </script>"""
# Fill the {protostr} placeholder with the encoded statistics.
html = HTML_TEMPLATE.format(protostr=protostr)
Exemple #8
0
    names=features,
    sep=r'\s*,\s*',
    skiprows=[0],
    engine='python',
    na_values="?")

# Calculate the feature statistics proto from the datasets and stringify it for use in facets overview.

# This code assumes that the facets-overview package has been installed through pip,
# along with a tensorflow (or tensorflow-gpu) package.

# Compute one statistics proto covering both the 'train' and 'test' datasets.
gfsg = GenericFeatureStatisticsGenerator()
proto = gfsg.ProtoFromDataFrames([
    {'name': 'train', 'table': train_data},
    {'name': 'test', 'table': test_data},
])
# Serialize and base64-encode once; the original repeated this statement,
# redoing the same work for an identical result.
protostr = base64.b64encode(proto.SerializeToString()).decode("utf-8")

# Markup that loads the Facets bundle and feeds the encoded statistics to the
# <facets-overview> element through its protoInput property.
HTML_TEMPLATE = """
        <script src="https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js"></script>
        <link rel="import" href="https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html" >
        <facets-overview id="elem"></facets-overview>
        <script>
          document.querySelector("#elem").protoInput = "{protostr}";
        </script>"""
html = HTML_TEMPLATE.format(protostr=protostr)