Example #1
 def _from_json():
     try:
         return pd.read_json(json)
     except ValueError:
         return pd.read_json(json, typ="series")
     except:
         raise
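
A minimal standalone sketch (the payload is invented) of why the fallback above is needed: a flat JSON object of scalars cannot be parsed as a DataFrame, but the same string loads cleanly once typ="series" is passed.

import pandas as pd

payload = '{"a": 1, "b": 2}'
try:
    result = pd.read_json(payload)              # ValueError: all scalar values, no index
except ValueError:
    result = pd.read_json(payload, typ="series")
print(result)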
Example #2
def legacy_kdata_to_csv():
    for index, security_item in get_security_list().iterrows():
        for fuquan in (True, False):
            dir = get_kdata_dir_old(security_item, fuquan)
            if os.path.exists(dir):
                files = [os.path.join(dir, f) for f in os.listdir(dir) if
                         ('all' not in f and 'json' in f and os.path.isfile(os.path.join(dir, f)))]

                for f in files:
                    tmp = os.path.basename(f).split('_')
                    if fuquan:
                        csv_path = get_kdata_path(security_item, tmp[0], tmp[1], 'hfq')
                        if not os.path.exists(csv_path):
                            df = pd.read_json(f, dtype={'code': str})
                            logger.info("{} to {}".format(f, csv_path))

                            df = df.loc[:,
                                 ['timestamp', 'code', 'low', 'open', 'close', 'high', 'volume', 'turnover',
                                  'securityId',
                                  'fuquan']]
                            df.columns = KDATA_COLUMN_FQ

                            df.to_csv(csv_path, index=False)
                    else:
                        csv_path = get_kdata_path(security_item, tmp[0], tmp[1], 'bfq')
                        if not os.path.exists(csv_path):
                            df = pd.read_json(f, dtype={'code': str})
                            logger.info("{} to {}".format(f, csv_path))

                            df = df.loc[:, KDATA_COLUMN]

                            df.to_csv(csv_path, index=False)
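
A small sketch (the record is invented) of what the dtype={'code': str} argument above guards against: by default read_json tries to coerce string columns that look numeric, which would drop the leading zeros from zero-padded security codes.

import pandas as pd

payload = '[{"code": "000001", "close": 10.5}]'
print(pd.read_json(payload).dtypes)                       # 'code' may be coerced to a numeric dtype
print(pd.read_json(payload, dtype={"code": str}).dtypes)  # 'code' requested as str, padding preserved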
Example #3
def main():
    emotions = pd.read_json('data/emotions.json')
    landmarks = pd.read_json('data/landmarks.json')
    facs = pd.read_json('data/facs.json')

    header = ['min', 'max', 'mean', 'std', 'count']

    print('===Emotions===')
    emotions_dict = emotions[['emotion']].describe().to_dict()
    emotions_dict['emotion']['count (incl. null values)'] = len(emotions)
    print(wiki_tables(emotions_dict, header+['count (incl. null values)']))

    print('===Landmarks===')
    print(wiki_tables(landmarks[['x', 'y']].describe().to_dict(), header))

    print('===FACS===')
    print("====Summary====")
    print(wiki_tables(facs[['au', 'intensity']].describe().to_dict(), header))

    print("====Details for each AU====")
    grouped_facs = facs.groupby(['au'])['intensity'].describe().to_dict()
    facs_dict = {}
    for au, field in grouped_facs:
        if au not in facs_dict:
            facs_dict[au] = {}

        facs_dict[au][field] = grouped_facs[(au, field)]

    print(wiki_tables(facs_dict, header, title='AU'))
Example #4
def read_files():
    # I hard-coded file names
    article_art = pd.read_json(open("./article.art"))
    article_rev = pd.read_json(open("./article.rev"))
    article_rev["is_talk"] = 0

    talk_art = pd.read_json(open("./talk.art"))
    talk_rev = pd.read_json(open("./talk.rev"))
    talk_rev["is_talk"] = 1

    revs = pd.concat([talk_rev, article_rev])
    arts = article_art.set_index("title").join(talk_art.set_index("title"), rsuffix="_talk").reset_index()

    if "anon" not in revs.columns:
        revs["anon"] = np.nan
    if "minor" not in revs.columns:
        revs["minor"] = np.nan
    if "suppressed" not in revs.columns:
        revs["suppressed"] = np.nan
    if "userhidden" not in revs.columns:
        revs["userhidden"] = np.nan

    revs.loc[~revs.anon.isnull(), "anon"] = 1
    revs.loc[~revs.minor.isnull(), "minor"] = 1
    revs.loc[~revs.suppressed.isnull(), "suppressed"] = 1
    revs.loc[~revs.userhidden.isnull(), "userhidden"] = 1

    revs.loc[revs.anon.isnull(), "anon"] = 0
    revs.loc[revs.minor.isnull(), "minor"] = 0
    revs.loc[revs.suppressed.isnull(), "suppressed"] = 0
    revs.loc[revs.userhidden.isnull(), "userhidden"] = 0

    return [arts, revs]
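
A minimal sketch (column values are invented) of the presence-flag conversion done above in two passes; notnull() produces the same 1/0 indicator in a single step.

import numpy as np
import pandas as pd

revs = pd.DataFrame({"anon": ["", np.nan, ""], "minor": [np.nan, "", np.nan]})
for col in ["anon", "minor"]:
    # flag is 1 when the API returned the field at all, 0 otherwise
    revs[col] = revs[col].notnull().astype(int)
print(revs)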
Example #5
 def jiejin_quarter(self):   # assemble the share-unlock (jiejin) data from its time series
     framewhatday1 = pd.DataFrame(columns=self.data_frame.columns)
     framewhatday2 = pd.DataFrame(columns=self.data_frame.columns)
     framewhatday3 = pd.DataFrame(columns=self.data_frame.columns)
     netprofit = pd.read_json('dataframe/' + NETPROFIT_NETPROFIT + '.json')
     netprofit = netprofit.sort_index(axis=1)
     holder_top10pct = pd.read_json('dataframe/' + ES_HOLDERS_PCT_HOLDER_TOP10PCT + '.json')
     holder_top10pct = holder_top10pct.sort_index(axis=1)
     holder_pctbyinst = pd.read_json('dataframe/' + ES_HOLDERS_PCT_HOLDER_PCTBYINST + '.json')
     holder_pctbyinst = holder_pctbyinst.sort_index(axis=1)
     for date in self.datelist:
         quarterday = what_quarter(str(date).split()[0])
         if quarterday[1] in netprofit.index:
             framewhatday1.loc[date] = (netprofit.loc[quarterday[0]] / netprofit.loc[quarterday[1]] - 1)
         else:
             framewhatday1.loc[date] = None
         if quarterday[2] in holder_top10pct.index:
             framewhatday2.loc[date] = holder_top10pct.loc[quarterday[2]] / 100
         else:
             framewhatday2.loc[date] = 0
         if quarterday[2] in holder_pctbyinst.index:
             framewhatday3.loc[date] = holder_pctbyinst.loc[quarterday[2]] / 100
         else:
             framewhatday3.loc[date] = 0
     framewhatday1 = framewhatday1.fillna(0)   # rows with missing net profit are simply set to 0
     return {JIEJIN_DATE: self.data_frame, NETPROFIT_NETPROFIT: framewhatday1, ES_HOLDERS_PCT_HOLDER_TOP10PCT: framewhatday2, ES_HOLDERS_PCT_HOLDER_PCTBYINST: framewhatday3}
Example #6
def test_lines_with_compression(compression):
    with tm.ensure_clean() as path:
        df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
        df.to_json(path, orient='records', lines=True, compression=compression)
        roundtripped_df = pd.read_json(path, lines=True,
                                       compression=compression)
        assert_frame_equal(df, roundtripped_df)
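
A minimal standalone sketch (the file name is assumed) of the same round trip outside the test harness: JSON Lines written with gzip compression and read straight back.

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
df.to_json("records.json.gz", orient="records", lines=True, compression="gzip")
roundtripped = pd.read_json("records.json.gz", lines=True, compression="gzip")
print(df.equals(roundtripped))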
Example #7
 def setUp(self):
     answers = gpd.read_file(DIRPATH + '/answers.geojson')
     tests = pd.read_json(DIRPATH + '/tests.json')
     hard_tests = pd.read_json(DIRPATH + '/degenerate.json')
     self.all = answers.merge(tests, on='names').merge(hard_tests, on='names')
     self.conn = Connection('DECENNIALSF12010')
     self.conn.set_mapservice('tigerWMS_Census2010')
Example #8
    def get(self, source='train'):
        ''' Take data from the source file and
            create an object instance with the data
            from that file.

        Parameters
        ----------
        source : string
            can be 'train' or 'test'; selects which file
            to load for preparation.

        Returns
        -------
        train_data : Pandas DataFrame, [m rows x n columns]
            example for train mode:
            -----------------------
            index           cuisine     id                 ingredients
                0             greek  10259  [romaine lettuce, black...
                1       southern_us  25693  [plain flour, ground pe...
                2          filipino  20130  [eggs, pepper, salt, ma...
        test_data : Pandas DataFrame, [m rows x n columns]    
            example for test mode:
            ----------------------
            index      id                                  ingredients
                0   10259  [romaine lettuce, black olives, grape to...
                1   25693  [plain flour, ground pepper, salt, tomat...
                2   20130  [eggs, pepper, salt, mayonaise, cooking ...
        '''
        data_file_name = self.folder + source + '.json'
        if source == 'train':
            self.train_data = pd.read_json(data_file_name)
            # self._vocabulary()
        else:
            self.test_data = pd.read_json(data_file_name)
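
A hedged standalone sketch (the folder layout is assumed) of the read that get() wraps: the source file is a JSON array of recipe records, so pd.read_json yields one row per recipe with the columns shown in the docstring.

import pandas as pd

train_data = pd.read_json('data/train.json')   # assumed location of the 'train' source
print(train_data[['id', 'cuisine', 'ingredients']].head(3))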
Example #9
def main():
    d = json.loads(sys.stdin.read())

    print('*** EM iterations ***')
    for i in range(4):
        #s = arbplf_newton_point(json.dumps(d))
        s = arbplf_coeff_expect(json.dumps(d))
        df = pd.read_json(StringIO(s), orient='split', precise_float=True)
        r = list(df.value)
        d['model_and_data']['edge_rate_coefficients'] = r
        print('EM summary:')
        print(r)
        summarize(d)
        print()

    print('*** newton iterations ***')
    for i in range(6):
        s = arbplf_newton_point(json.dumps(d))
        df = pd.read_json(StringIO(s), orient='split', precise_float=True)
        r = list(df.value)
        d['model_and_data']['edge_rate_coefficients'] = r
        print('newton summary:')
        print(r)
        summarize(d)
        print()
Example #10
    def test_frame_non_unique_columns(self):
        df = DataFrame([["a", "b"], ["c", "d"]], index=[1, 2], columns=["x", "x"])

        self.assertRaises(ValueError, df.to_json, orient="index")
        self.assertRaises(ValueError, df.to_json, orient="columns")
        self.assertRaises(ValueError, df.to_json, orient="records")

        assert_frame_equal(df, read_json(df.to_json(orient="split"), orient="split", dtype=False))
        unser = read_json(df.to_json(orient="values"), orient="values")
        np.testing.assert_equal(df.values, unser.values)

        # GH4377; duplicate columns not processing correctly
        df = DataFrame([["a", "b"], ["c", "d"]], index=[1, 2], columns=["x", "y"])
        result = read_json(df.to_json(orient="split"), orient="split")
        assert_frame_equal(result, df)

        def _check(df):
            result = read_json(df.to_json(orient="split"), orient="split", convert_dates=["x"])
            assert_frame_equal(result, df)

        for o in [
            [["a", "b"], ["c", "d"]],
            [[1.5, 2.5], [3.5, 4.5]],
            [[1, 2.5], [3, 4.5]],
            [[Timestamp("20130101"), 3.5], [Timestamp("20130102"), 4.5]],
        ]:
            _check(DataFrame(o, index=[1, 2], columns=["x", "x"]))
Example #11
def read_salmon_qc(sample_path, flen_lim=(100, 100), version='0.7.2'):
    ''' Parse technical quality control data from a Salmon quantification
    result.

    Parameters
    ----------
    flen_lim, tuple (int start, int end), default (100, 100)
        How many bases to remove from the start and end of the fragment length
        distribution when calculating the robust mode. This is to see if
        things roughly worked out even if the max FLD Salmon parameter was
        set too small.

    version, str, default '0.7.2'
        The version of Salmon which generated the directory. Currently
        supports '0.7.2', '0.6.0' and '0.4.0'. (Other versions might be
        compatible with these.)

    Returns
    -------
    A pandas.Series with technical information from the Salmon results for
    the sample.
    '''
    try:
        flen_dist = np.fromfile(sample_path + '/libParams/flenDist.txt', sep='\t')
        global_fl_mode = flen_dist.argmax()
        robust_fl_mode = flen_dist[flen_lim[0]:-flen_lim[1]].argmax() + flen_lim[0]
    except FileNotFoundError:
        global_fl_mode = 0
        robust_fl_mode = 0

    if version == '0.7.2':
        qc_data = pd.read_json(sample_path + '/aux_info/meta_info.json', typ='series')
        qc_data = qc_data[['num_processed', 'num_mapped', 'percent_mapped']]
        qc_data['global_fl_mode'] = global_fl_mode
        qc_data['robust_fl_mode'] = robust_fl_mode

    if version == '0.6.0':
        qc_data = pd.read_json(sample_path + '/aux/meta_info.json', typ='series')
        qc_data = qc_data[['num_processed', 'num_mapped', 'percent_mapped']]
        qc_data['global_fl_mode'] = global_fl_mode
        qc_data['robust_fl_mode'] = robust_fl_mode

    if version == '0.4.0':
        qc_data = pd.Series()
        log_file = sample_path + '/logs/salmon_quant.log'
        with open(log_file) as fh:
            for l in fh:
                if 'Observed ' in l:
                    frags = int(l.split('Observed ')[-1].split(' total')[0])
                    qc_data['num_processed'] = frags

                if 'mapping rate' in l:
                    rate = float(l.split(' = ')[1].split('%')[0])
                    qc_data['percent_mapped'] = rate


        qc_data['global_fl_mode'] = global_fl_mode
        qc_data['robust_fl_mode'] = robust_fl_mode

    return qc_data
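
A hedged usage sketch of the helper above; the Salmon output directory is an assumption, and the call relies on the function and imports (pandas as pd, numpy as np) already shown in this example.

qc = read_salmon_qc('salmon_out/sampleA', flen_lim=(100, 100), version='0.7.2')
print(qc[['num_processed', 'num_mapped', 'percent_mapped']])
print(qc['global_fl_mode'], qc['robust_fl_mode'])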
Example #12
    def test_v12_compat(self):
        df = DataFrame(
            [
                [1.56808523, 0.65727391, 1.81021139, -0.17251653],
                [-0.2550111, -0.08072427, -0.03202878, -0.17581665],
                [1.51493992, 0.11805825, 1.629455, -1.31506612],
                [-0.02765498, 0.44679743, 0.33192641, -0.27885413],
                [0.05951614, -2.69652057, 1.28163262, 0.34703478],
            ],
            columns=["A", "B", "C", "D"],
            index=pd.date_range("2000-01-03", "2000-01-07"),
        )
        df["date"] = pd.Timestamp("19920106 18:21:32.12")
        df.ix[3, "date"] = pd.Timestamp("20130101")
        df["modified"] = df["date"]
        df.ix[1, "modified"] = pd.NaT

        v12_json = os.path.join(self.dirpath, "tsframe_v012.json")
        df_unser = pd.read_json(v12_json)
        assert_frame_equal(df, df_unser)

        df_iso = df.drop(["modified"], axis=1)
        v12_iso_json = os.path.join(self.dirpath, "tsframe_iso_v012.json")
        df_unser_iso = pd.read_json(v12_iso_json)
        assert_frame_equal(df_iso, df_unser_iso)
Example #13
    def test_frame_mixedtype_orient(self):  # GH10289
        vals = [
            [10, 1, "foo", 0.1, 0.01],
            [20, 2, "bar", 0.2, 0.02],
            [30, 3, "baz", 0.3, 0.03],
            [40, 4, "qux", 0.4, 0.04],
        ]

        df = DataFrame(vals, index=list("abcd"), columns=["1st", "2nd", "3rd", "4th", "5th"])

        self.assertTrue(df._is_mixed_type)
        right = df.copy()

        for orient in ["split", "index", "columns"]:
            inp = df.to_json(orient=orient)
            left = read_json(inp, orient=orient, convert_axes=False)
            assert_frame_equal(left, right)

        right.index = np.arange(len(df))
        inp = df.to_json(orient="records")
        left = read_json(inp, orient="records", convert_axes=False)
        assert_frame_equal(left, right)

        right.columns = np.arange(df.shape[1])
        inp = df.to_json(orient="values")
        left = read_json(inp, orient="values", convert_axes=False)
        assert_frame_equal(left, right)
Example #14
    def test_timedelta(self):
        converter = lambda x: pd.to_timedelta(x, unit="ms")

        s = Series([timedelta(23), timedelta(seconds=5)])
        self.assertEqual(s.dtype, "timedelta64[ns]")
        # index will be float dtype
        assert_series_equal(s, pd.read_json(s.to_json(), typ="series").apply(converter), check_index_type=False)

        s = Series([timedelta(23), timedelta(seconds=5)], index=pd.Index([0, 1], dtype=float))
        self.assertEqual(s.dtype, "timedelta64[ns]")
        assert_series_equal(s, pd.read_json(s.to_json(), typ="series").apply(converter))

        frame = DataFrame([timedelta(23), timedelta(seconds=5)])
        self.assertEqual(frame[0].dtype, "timedelta64[ns]")
        assert_frame_equal(
            frame, pd.read_json(frame.to_json()).apply(converter), check_index_type=False, check_column_type=False
        )

        frame = DataFrame(
            {
                "a": [timedelta(days=23), timedelta(seconds=5)],
                "b": [1, 2],
                "c": pd.date_range(start="20130101", periods=2),
            }
        )

        result = pd.read_json(frame.to_json(date_unit="ns"))
        result["a"] = pd.to_timedelta(result.a, unit="ns")
        result["c"] = pd.to_datetime(result.c)
        assert_frame_equal(frame, result, check_index_type=False)
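
A minimal standalone sketch of the round-trip issue this test works around: to_json() writes the timedeltas as plain integers (milliseconds by default), so read_json() returns numbers until the converter is applied.

import pandas as pd
from datetime import timedelta

s = pd.Series([timedelta(days=1), timedelta(seconds=5)])
raw = pd.read_json(s.to_json(), typ="series")                    # integer milliseconds
restored = raw.apply(lambda x: pd.to_timedelta(x, unit="ms"))    # back to timedelta64[ns]
print(raw.dtype, restored.dtype)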
Example #15
    def test_frame_non_unique_columns(self):
        df = DataFrame([['a', 'b'], ['c', 'd']], index=[1, 2],
                       columns=['x', 'x'])

        self.assertRaises(ValueError, df.to_json, orient='index')
        self.assertRaises(ValueError, df.to_json, orient='columns')
        self.assertRaises(ValueError, df.to_json, orient='records')

        assert_frame_equal(df, read_json(df.to_json(orient='split'),
                                         orient='split', dtype=False))
        unser = read_json(df.to_json(orient='values'), orient='values')
        tm.assert_numpy_array_equal(df.values, unser.values)

        # GH4377; duplicate columns not processing correctly
        df = DataFrame([['a', 'b'], ['c', 'd']], index=[
                       1, 2], columns=['x', 'y'])
        result = read_json(df.to_json(orient='split'), orient='split')
        assert_frame_equal(result, df)

        def _check(df):
            result = read_json(df.to_json(orient='split'), orient='split',
                               convert_dates=['x'])
            assert_frame_equal(result, df)

        for o in [[['a', 'b'], ['c', 'd']],
                  [[1.5, 2.5], [3.5, 4.5]],
                  [[1, 2.5], [3, 4.5]],
                  [[Timestamp('20130101'), 3.5],
                   [Timestamp('20130102'), 4.5]]]:
            _check(DataFrame(o, index=[1, 2], columns=['x', 'x']))
Example #16
    def test_frame_mixedtype_orient(self):  # GH10289
        vals = [[10, 1, 'foo', .1, .01],
                [20, 2, 'bar', .2, .02],
                [30, 3, 'baz', .3, .03],
                [40, 4, 'qux', .4, .04]]

        df = DataFrame(vals, index=list('abcd'),
                       columns=['1st', '2nd', '3rd', '4th', '5th'])

        self.assertTrue(df._is_mixed_type)
        right = df.copy()

        for orient in ['split', 'index', 'columns']:
            inp = df.to_json(orient=orient)
            left = read_json(inp, orient=orient, convert_axes=False)
            assert_frame_equal(left, right)

        right.index = np.arange(len(df))
        inp = df.to_json(orient='records')
        left = read_json(inp, orient='records', convert_axes=False)
        assert_frame_equal(left, right)

        right.columns = np.arange(df.shape[1])
        inp = df.to_json(orient='values')
        left = read_json(inp, orient='values', convert_axes=False)
        assert_frame_equal(left, right)
Example #17
  def searchByName(self, searchByName=""):
    searchParameters = 'searchByName=%s' % searchByName
    print searchParameters
    if cherrypy.request.method != 'GET':
      cherrypy.response.headers['Allow'] = 'GET'
      raise cherrypy.HTTPError(405)
    if not searchByName:
      return "Please enter a valid name"
    if self.searchManager(searchParameters):
      cand_ids, cand_comms = self.searches[searchParameters]
      if not cand_ids or not cand_comms:
        return self.noResultsFound()
      cand_ids = pandas.read_json(json.dumps(cand_ids)).to_html()
      cand_comms = pandas.read_json(json.dumps(cand_comms), orient='index').to_html()
      return cand_ids+cand_comms
    else:
      s = self.__SearchLocation(self.__Connection)
      parameters = {'name':searchByName}
      cand_ids, cand_comms = s.search_by_name(parameters)
      if not cand_ids or not cand_comms:
        return self.noResultsFound()
      self.searches[searchParameters] = (cand_ids, cand_comms)
      cand_ids = pandas.read_json(json.dumps(cand_ids)).to_html()
      cand_comms = pandas.read_json(json.dumps(cand_comms), orient='index').to_html()
      cherrypy.response.headers['Content-Type'] = 'text/html'
      return cand_ids+cand_comms
Example #18
    def get_all_vocabulary(self):
        all_codes, code_types = self.get_meth_codes()

        ## do it this way if you want a non-nested list of all codes
        ## i.e. er_codes = [code1, code2,...]
        ##def get_one_meth_category(category, all_codes, code_types):

        ## do it this way if you want a tiered list of all codes
        ## i.e. er_codes = {'anisotropy_codes': ['code1', 'code2'], ...}
        ##def get_tiered_meth_category(mtype, all_codes, code_types):

        if any(all_codes):
            methods = self.get_tiered_meth_category('other', all_codes, code_types)
            age_methods = self.get_tiered_meth_category('age', all_codes, code_types)
        else:
            methods = self.get_tiered_meth_category_offline()
            age_methods = self.get_tiered_meth_category_offline()
            path = os.path.join(data_model_dir, 'code_types.txt')
            with open(path, 'r') as type_file:
                raw_code_types = json.load(type_file)
            code_types = pd.read_json(raw_code_types)
            path = os.path.join(data_model_dir, 'all_codes.txt')
            with open(path, 'r') as code_file:
                raw_all_codes = json.load(code_file)
            all_codes = pd.read_json(raw_all_codes)

        vocabularies = self.get_controlled_vocabularies()
        suggested = self.get_suggested_vocabularies()
        self.vocabularies = vocabularies
        self.suggested = suggested
        #self.possible_vocabularies = possible_vocabularies
        self.all_codes = all_codes
        self.code_types = code_types
        self.methods = methods
        self.age_methods = age_methods
Example #19
    def GET(self):
		web.header('Access-Control-Allow-Origin', '*')
		output = dict()
		getInput = web.input(start='2012-3-03 16:00:00', end='2012-3-03 21:00:00')
		start_time=pd.to_datetime(getInput.start).tz_localize('US/Eastern') - pd.DateOffset(hours=10)
		end_time=pd.to_datetime(getInput.end).tz_localize('US/Eastern') - pd.DateOffset(hours=10)
		
		output_nodes = set()
		all_schedules = pd.read_json('all_schedules.json')
		allnodes = pd.read_json('allnodes.json')
		nodes = set(allnodes.nodes)
		all_schedules['end'] = all_schedules['end'].map(lambda x: datetime.datetime.fromtimestamp(x/1000000000))
		all_schedules['start'] = all_schedules['start'].map(lambda x: datetime.datetime.fromtimestamp(x/1000000000))

		night_sched = all_schedules[(all_schedules.start >= start_time) & (all_schedules.end <= end_time)]
		on_nodes = set()
		for idx,show in night_sched.iterrows():
			on_nodes.add(show[2])
		
		off_nodes = nodes.difference(on_nodes)
		
		imported_graph = nx.read_gexf('./finished_network3.gexf')
		for i in off_nodes:
			try:
				imported_graph.remove_node(i)
			except:
				continue
		
		pr=nx.pagerank(imported_graph,alpha=0.9,weight='newweight',tol=.01, max_iter=200)
		
		output['nodes'] = [(i,v*1000000) for i,v in pr.items()]
		output['input_params'] = getInput
		return json.dumps(output)
Example #20
def get_plot_data():
    df = pd.read_json(URL_TOTALES_PRESIDENTE)
    totales = df[df.provincia == 99]
    totales = totales.sort('porc_final_agrupacion', ascending=False)

    codigos = totales.codigo_agrupacion.values
    porcentajes = totales.porc_final_agrupacion.astype(float).values
    porcentajes *= 0.01
    porcentajes = list(porcentajes)

    primero, segundo = porcentajes[:2]

    # if primero >= 45:
    #     falta = 0

    falta = max(40 , (segundo + 10)) - primero

    listas = pd.read_json(URL_LISTAS)
    def get_candidatos_lista(c):
        return listas[listas.codigo == c].siglas.values[0]

    fuerzas = [get_candidatos_lista(c) for c in codigos]
    candidatos =  [CANDIDATOS[f] for f in fuerzas]
    
    data = (candidatos, (porcentajes, falta))
    json_dump_unicode(data, "databokeh.json")
    
    return data
Example #21
def main():
    with open('data.txt', 'r') as f:
        #print f.read()
        #di = json.loads(f.read())
        data_2007 = pd.read_json(f.read())
        cols = ['tmp', 'thunder', 'press', 'hum', 'date', 'rain']
        train_y = data_2007['sales'] 
        train_x = data_2007[cols]
            
        print train_x.shape
        clf_lr = RandomForestClassifier(n_estimators=50, n_jobs=-1)
        clf_lr.fit(train_x, train_y)
        with open('data2.txt', 'r') as f2:
            data_2008 = pd.read_json(f2.read())
        pr = clf_lr.predict(data_2008[['tmp', 'thunder', 'press', 'hum', 'date', 'rain']])
        print pr
        with open('data2.txt', 'r') as f:
            elems = json.loads(f.read())
                
            m=0
            for i in pr:
                elems[m]["sales"]= i
                m+=1
        
        l = [[int(time.mktime(datetime.datetime.strptime(i["date"], "%Y%m%d").timetuple()))*1000,i["sales"]] for i in elems]
        #y = [i["sales"] for i in elems]
        print l
Example #22
def test_readjson_chunks_from_file():
    with ensure_clean('test.json') as path:
        df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
        df.to_json(path, lines=True, orient="records")
        chunked = pd.concat(pd.read_json(path, lines=True, chunksize=1))
        unchunked = pd.read_json(path, lines=True)
        assert_frame_equal(unchunked, chunked)
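
A minimal sketch (the file name is assumed) of using the chunked reader directly instead of concatenating, for example to stream a large JSON Lines file without holding it all in memory.

import pandas as pd

total_rows = 0
for chunk in pd.read_json("big_records.json", lines=True, chunksize=10000):
    total_rows += len(chunk)            # each chunk is an ordinary DataFrame
print(total_rows)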
Example #23
def gatherStats3(fileLoc, force=False):
	dirName = fileLoc.split("/")[-2]
	outFilename = PROCESSED_FILE_LOC + "stats_" + dirName + ".json"

	if force or not isfile(outFilename):
		startYear = int(dirName.split("_")[1])
		endYear = int(dirName.split("_")[2])

		fileList = listdir(fileLoc)
		print "fileListLen:", len(fileList)

		stats = pd.DataFrame(index = range(startYear, endYear+1))
		for f in fileList: 
			path = join(fileLoc, f)
			if isfile(path) and getsize(path) > 0 and f!='dummy.json':
				quotes = pd.read_json(path)
				quotes['Year'] = quotes['Date'].apply(lambda x: x.year)
				grp = quotes[['Year', 'Date']].groupby(['Year']).count()

				symbol = f.split('.')[0]
				s = pd.Series(0, index=stats.index)
				for year in grp.index:
					s[year] = grp.loc[year]['Date']

				stats[symbol] = s

		stats.to_json(outFilename)
	else:
		stats = pd.read_json(outFilename)

	return stats
Example #24
    def test_timedelta(self):
        converter = lambda x: pd.to_timedelta(x, unit='ms')

        s = Series([timedelta(23), timedelta(seconds=5)])
        self.assertEqual(s.dtype, 'timedelta64[ns]')

        result = pd.read_json(s.to_json(), typ='series').apply(converter)
        assert_series_equal(result, s)

        s = Series([timedelta(23), timedelta(seconds=5)],
                   index=pd.Index([0, 1]))
        self.assertEqual(s.dtype, 'timedelta64[ns]')
        result = pd.read_json(s.to_json(), typ='series').apply(converter)
        assert_series_equal(result, s)

        frame = DataFrame([timedelta(23), timedelta(seconds=5)])
        self.assertEqual(frame[0].dtype, 'timedelta64[ns]')
        assert_frame_equal(frame, pd.read_json(frame.to_json())
                           .apply(converter))

        frame = DataFrame({'a': [timedelta(days=23), timedelta(seconds=5)],
                           'b': [1, 2],
                           'c': pd.date_range(start='20130101', periods=2)})

        result = pd.read_json(frame.to_json(date_unit='ns'))
        result['a'] = pd.to_timedelta(result.a, unit='ns')
        result['c'] = pd.to_datetime(result.c)
        assert_frame_equal(frame, result)
Example #25
def test_read_zipped_json():
    uncompressed_path = tm.get_data_path("tsframe_v012.json")
    uncompressed_df = pd.read_json(uncompressed_path)

    compressed_path = tm.get_data_path("tsframe_v012.json.zip")
    compressed_df = pd.read_json(compressed_path, compression='zip')

    assert_frame_equal(uncompressed_df, compressed_df)
Example #26
def add_new_url_list(new):
    old = pn.read_json("urls_linkedin.json")
    new = pn.read_json(new)
    data = old.append(new)
    data.drop_duplicates(inplace=True,take_last=True)
    data.reset_index(drop=True,inplace=True)
    data.to_json("urls_linkedin.json")
    return "new urls added"
Example #27
    def test_series_non_unique_index(self):
        s = Series(["a", "b"], index=[1, 1])

        self.assertRaises(ValueError, s.to_json, orient="index")

        assert_series_equal(s, read_json(s.to_json(orient="split"), orient="split", typ="series"))
        unser = read_json(s.to_json(orient="records"), orient="records", typ="series")
        np.testing.assert_equal(s.values, unser.values)
Example #28
def test_read_zipped_json(datapath):
    uncompressed_path = datapath("io", "json", "data", "tsframe_v012.json")
    uncompressed_df = pd.read_json(uncompressed_path)

    compressed_path = datapath("io", "json", "data", "tsframe_v012.json.zip")
    compressed_df = pd.read_json(compressed_path, compression='zip')

    assert_frame_equal(uncompressed_df, compressed_df)
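
A small sketch (the path is assumed) of the same read with the compression inferred from the file extension, which is the read_json default.

import pandas as pd

df_explicit = pd.read_json("tsframe_v012.json.zip", compression="zip")
df_inferred = pd.read_json("tsframe_v012.json.zip")   # compression='infer' picks zip from the suffix
print(df_explicit.equals(df_inferred))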
Example #29
def test_chunksize_with_compression(compression):
    with tm.ensure_clean() as path:
        df = pd.read_json('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}')
        df.to_json(path, orient='records', lines=True, compression=compression)

        roundtripped_df = pd.concat(pd.read_json(path, lines=True, chunksize=1,
                                                 compression=compression))
        assert_frame_equal(df, roundtripped_df)
Example #30
def pred_hotstar():
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier

    df_train = pd.read_json("~/Documents/dataset/hotstar/train_data.json")
    df_test = pd.read_json("~/Documents/dataset/hotstar/test_data.json")

    df_train.head()
Example #31
from sklearn.feature_extraction.text import TfidfVectorizer

if len(sys.argv) == 1:
    sys.exit("test or train arg is missing.")

test_train = sys.argv[1].strip().lower()
if test_train not in ('test', 'train'):
    sys.exit("Invalid argument passed.")

dataset_dir = 'data/{}.json'.format(test_train)
output_dir = 'data/{}/'.format(test_train)

stop_words = set(stopwords.words('english'))
porter = PorterStemmer()

data = pd.read_json(dataset_dir, convert_dates=['created'])


def parse_text(text):
    soup = BeautifulSoup(text)
    return ' ' + soup.get_text()


def clean_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    words = []
    for token in tokens:
        if (token.isalpha()):
            words.append(porter.stem(token))
    cleaned_text = ' '.join(words)
Example #32
def load_energy():
    """Loads an energy related dataset to use with sankey and graphs"""
    tbl_name = 'energy_usage'
    with gzip.open(os.path.join(DATA_FOLDER, 'energy.json.gz')) as f:
        pdf = pd.read_json(f)
    pdf.to_sql(tbl_name,
               db.engine,
               if_exists='replace',
               chunksize=500,
               dtype={
                   'source': String(255),
                   'target': String(255),
                   'value': Float(),
               },
               index=False)

    print("Creating table [wb_health_population] reference")
    tbl = db.session.query(TBL).filter_by(table_name=tbl_name).first()
    if not tbl:
        tbl = TBL(table_name=tbl_name)
    tbl.description = "Energy consumption"
    tbl.is_featured = True
    tbl.database = get_or_create_main_db()
    db.session.merge(tbl)
    db.session.commit()
    tbl.fetch_metadata()

    slc = Slice(slice_name="Energy Sankey",
                viz_type='sankey',
                datasource_type='table',
                datasource_id=tbl.id,
                params=textwrap.dedent("""\
        {
            "collapsed_fieldsets": "",
            "datasource_id": "3",
            "datasource_name": "energy_usage",
            "datasource_type": "table",
            "flt_col_0": "source",
            "flt_eq_0": "",
            "flt_op_0": "in",
            "groupby": [
                "source",
                "target"
            ],
            "having": "",
            "metric": "sum__value",
            "row_limit": "5000",
            "slice_name": "Energy Sankey",
            "viz_type": "sankey",
            "where": ""
        }
        """))
    misc_dash_slices.append(slc.slice_name)
    merge_slice(slc)

    slc = Slice(slice_name="Energy Force Layout",
                viz_type='directed_force',
                datasource_type='table',
                datasource_id=tbl.id,
                params=textwrap.dedent("""\
        {
            "charge": "-500",
            "collapsed_fieldsets": "",
            "datasource_id": "1",
            "datasource_name": "energy_usage",
            "datasource_type": "table",
            "flt_col_0": "source",
            "flt_eq_0": "",
            "flt_op_0": "in",
            "groupby": [
                "source",
                "target"
            ],
            "having": "",
            "link_length": "200",
            "metric": "sum__value",
            "row_limit": "5000",
            "slice_name": "Force",
            "viz_type": "directed_force",
            "where": ""
        }
        """))
    misc_dash_slices.append(slc.slice_name)
    merge_slice(slc)

    slc = Slice(slice_name="Heatmap",
                viz_type='heatmap',
                datasource_type='table',
                datasource_id=tbl.id,
                params=textwrap.dedent("""\
        {
            "all_columns_x": "source",
            "all_columns_y": "target",
            "canvas_image_rendering": "pixelated",
            "collapsed_fieldsets": "",
            "datasource_id": "1",
            "datasource_name": "energy_usage",
            "datasource_type": "table",
            "flt_col_0": "source",
            "flt_eq_0": "",
            "flt_op_0": "in",
            "having": "",
            "linear_color_scheme": "blue_white_yellow",
            "metric": "sum__value",
            "normalize_across": "heatmap",
            "slice_name": "Heatmap",
            "viz_type": "heatmap",
            "where": "",
            "xscale_interval": "1",
            "yscale_interval": "1"
        }
        """))
    misc_dash_slices.append(slc.slice_name)
    merge_slice(slc)
Example #33
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime

df = pd.read_json('amsterdam_sold_geo_4pp.json')
df.reset_index(drop=True)

df['price per unit area'] = df['price'] / df['area']

df['postdate'] = pd.to_datetime(df['posting_date'], dayfirst=True)
df['saledate'] = pd.to_datetime(df['sale_date'], dayfirst=True)

# print(df['saledate'] - df['postdate'])
df['time_to_sell'] = df['saledate'] - df['postdate']

dfp = df[df['price'] > 50000]
dfp['days_to_sell'] = dfp['time_to_sell'].apply(lambda x: x.days)
dfp = dfp[dfp['days_to_sell'] > 0]

dfp_week = dfp.groupby(pd.TimeGrouper(key='saledate', freq='W')).mean()

fig, (ax1, ax2) = plt.subplots(2, sharex=True)
ax1.scatter(dfp['saledate'].values, dfp['price'] / 1000.0, s=10)
ax1.plot_date(dfp_week.index, dfp_week['price'] / 1000.0, 'r-', linewidth=2)
ax1.set_xlim([datetime.datetime(2015, 4, 1), datetime.datetime(2016, 8, 1)])
ax1.set_xlabel('Date of sale')
ax1.set_ylabel(u'Asking price (1,000 EUR)')
ax1.set_title('Property sales in Amsterdam')
ax1.set_ylim([0, 800])
ax1.grid('on')
Example #34
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 20 14:09:50 2015

@author: btrani
"""

import numpy as np
import pandas as pd
import datetime
import urllib

# Read in our data. We've aggregated it by date already, so we don't need to worry about paging
query = (
    'https://data.smgov.net/resource/xx64-wi4x.json?$$app_token=g5iIFV3PzVEgEGqkxekFlTlxW'
)
raw_data = pd.read_json(query)
Example #35
    def extractFtpData(self):
        print('\nExtracting FTP Glofas Data\n')

        files = [
            f for f in listdir(self.inputPath)
            if isfile(join(self.inputPath, f)) and f.endswith('.nc')
        ]

        df_thresholds = pd.read_json(json.dumps(self.GLOFAS_STATIONS))
        df_thresholds = df_thresholds.set_index("stationCode", drop=False)
        df_district_mapping = pd.read_json(json.dumps(self.DISTRICT_MAPPING))
        df_district_mapping = df_district_mapping.set_index("glofasStation",
                                                            drop=False)

        stations = []
        trigger_per_day = {
            1: 0,
            2: 0,
            3: 0,
            4: 0,
            5: 0,
            6: 0,
            7: 0,
        }
        for i in range(0, len(files)):
            logging.info("Extracting glofas data from %s", i)
            Filename = os.path.join(self.inputPath, files[i])
            station = {}
            station['code'] = files[i].split('_')[2]

            data = xr.open_dataset(Filename)

            # Get threshold for this specific station
            if station['code'] in df_thresholds['stationCode'] and station[
                    'code'] in df_district_mapping['glofasStation']:
                print(Filename)
                threshold = df_thresholds[df_thresholds['stationCode'] ==
                                          station['code']][TRIGGER_LEVEL][0]

                # Set dimension-values
                time = 0

                for step in range(1, 8):

                    # Loop through 51 ensembles, get forecast and compare to threshold
                    ensemble_options = 51
                    count = 0
                    dis_sum = 0
                    for ensemble in range(0, ensemble_options):

                        discharge = data['dis'].sel(ensemble=ensemble,
                                                    step=step).values[time][0]

                        # DUMMY OVERWRITE DEPENDING ON COUNTRY SETTING
                        if SETTINGS_SECRET[self.country_code]['mock'] == True:
                            if SETTINGS_SECRET[self.country_code][
                                    'if_mock_trigger'] == True:
                                if step < 5:  # Only dummy trigger for 5-day and above
                                    discharge = 0
                                elif station[
                                        'code'] == 'G1361':  # ZMB dummy flood station 1
                                    discharge = 8000
                                elif station[
                                        'code'] == 'G1328':  # ZMB dummy flood station 2
                                    discharge = 9000
                                else:
                                    discharge = 0
                            else:
                                discharge = 0

                        if discharge >= threshold:
                            count = count + 1
                        dis_sum = dis_sum + discharge

                    prob = count / ensemble_options
                    dis_avg = dis_sum / ensemble_options
                    station['fc'] = dis_avg
                    station['fc_prob'] = prob
                    station['fc_trigger'] = 1 if prob > TRIGGER_LEVELS[
                        'minimum'] else 0

                    if station['fc_trigger'] == 1:
                        trigger_per_day[step] = 1

                    if step == self.leadTimeValue:
                        stations.append(station)
                    station = {}
                    station['code'] = files[i].split('_')[2]

            data.close()

        # Add 'no_station'
        for station_code in ['no_station']:
            station = {}
            station['code'] = station_code
            station['fc'] = 0
            station['fc_prob'] = 0
            station['fc_trigger'] = 0
            stations.append(station)

        with open(self.extractedGlofasPath, 'w') as fp:
            json.dump(stations, fp)
            print('Extracted Glofas data - File saved')

        with open(self.triggerPerDay, 'w') as fp:
            json.dump([trigger_per_day], fp)
            print('Extracted Glofas data - Trigger per day File saved')
Example #36
import pydeck as pdk
import pandas as pd

GPU_GRID_LAYER_DATA = ("https://raw.githubusercontent.com/uber-common/"
                       "deck.gl-data/master/website/sf-bike-parking.json")
df = pd.read_json(GPU_GRID_LAYER_DATA)

# Define a layer to display on a map
layer = pdk.Layer(
    "GPUGridLayer",
    df,
    pickable=True,
    extruded=True,
    cellSize=200,
    elevation_scale=4,
    get_position="COORDINATES",
)

# Set the viewport location
view_state = pdk.ViewState(latitude=37.7749295,
                           longitude=-122.4194155,
                           zoom=11,
                           bearing=0,
                           pitch=45)

# Render
r = pdk.Deck(
    layers=[layer],
    initial_view_state=view_state,
    tooltip={"text": "{position}\nCount: {count}"},
)
Example #37
File: io.py Project: xrmx/modin
    def read_json(
        cls,
        path_or_buf=None,
        orient=None,
        typ="frame",
        dtype=True,
        convert_axes=True,
        convert_dates=True,
        keep_default_dates=True,
        numpy=False,
        precise_float=False,
        date_unit=None,
        encoding=None,
        lines=False,
        chunksize=None,
        compression="infer",
    ):
        kwargs = {
            "path_or_buf": path_or_buf,
            "orient": orient,
            "typ": typ,
            "dtype": dtype,
            "convert_axes": convert_axes,
            "convert_dates": convert_dates,
            "keep_default_dates": keep_default_dates,
            "numpy": numpy,
            "precise_float": precise_float,
            "date_unit": date_unit,
            "encoding": encoding,
            "lines": lines,
            "chunksize": chunksize,
            "compression": compression,
        }
        if cls.read_json_remote_task is None:
            return super(RayIO, cls).read_json(**kwargs)

        if not lines:
            ErrorMessage.default_to_pandas(
                "`read_json` only optimized with `lines=True`"
            )
            return super(RayIO, cls).read_json(**kwargs)
        else:
            # TODO: Pick up the columns in an optimized way from all data
            # All rows must be read because some rows may have missing data
            # Currently assumes all rows have the same columns
            from io import BytesIO

            columns = pandas.read_json(
                BytesIO(b"" + open(path_or_buf, "rb").readline()), lines=True
            ).columns
            kwargs["columns"] = columns
            empty_pd_df = pandas.DataFrame(columns=columns)

            path_or_buf = kwargs.pop("path_or_buf")

            with file_open(path_or_buf, "rb", kwargs.get("compression", "infer")) as f:
                total_bytes = file_size(f)
                from modin.pandas import DEFAULT_NPARTITIONS

                num_partitions = DEFAULT_NPARTITIONS
                num_splits = min(len(columns), num_partitions)
                chunk_size = max(1, (total_bytes - f.tell()) // num_partitions)

                partition_ids = []
                index_ids = []
                dtypes_ids = []

                column_chunksize = compute_chunksize(empty_pd_df, num_splits, axis=1)
                if column_chunksize > len(columns):
                    column_widths = [len(columns)]
                    num_splits = 1
                else:
                    column_widths = [
                        column_chunksize
                        if i != num_splits - 1
                        else len(columns) - (column_chunksize * (num_splits - 1))
                        for i in range(num_splits)
                    ]

                while f.tell() < total_bytes:
                    start = f.tell()
                    f.seek(chunk_size, os.SEEK_CUR)
                    f.readline()
                    partition_id = cls.read_json_remote_task._remote(
                        args=(path_or_buf, num_splits, start, f.tell(), kwargs),
                        num_return_vals=num_splits + 3,
                    )
                    partition_ids.append(partition_id[:-3])
                    index_ids.append(partition_id[-3])
                    dtypes_ids.append(partition_id[-2])

            row_lengths = ray.get(index_ids)
            new_index = pandas.RangeIndex(sum(row_lengths))

            dtypes = (
                pandas.concat(ray.get(dtypes_ids), axis=1)
                .apply(lambda row: find_common_type(row.values), axis=1)
                .squeeze(axis=0)
            )

            partition_ids = [
                [
                    cls.frame_partition_cls(
                        partition_ids[i][j],
                        length=row_lengths[i],
                        width=column_widths[j],
                    )
                    for j in range(len(partition_ids[i]))
                ]
                for i in range(len(partition_ids))
            ]

            if isinstance(dtypes, pandas.Series):
                dtypes.index = columns
            else:
                dtypes = pandas.Series(dtypes, index=columns)

            new_frame = cls.frame_cls(
                np.array(partition_ids),
                new_index,
                columns,
                row_lengths,
                column_widths,
                dtypes=dtypes,
            )
            new_frame._apply_index_objs(axis=0)
            return cls.query_compiler_cls(new_frame)
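
A minimal standalone sketch (outside Modin, the file name is assumed) of the column-discovery trick used above: read only the first JSON Lines record to learn the column names before splitting the file into byte ranges.

import pandas as pd
from io import BytesIO

with open("records.json", "rb") as f:       # assumed JSON Lines file
    first_line = f.readline()
columns = pd.read_json(BytesIO(first_line), lines=True).columns
print(list(columns))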
Example #38
def update_font(p_dfs, lang):
    start_time = time.time()
    pages = len(p_dfs)
    new_dfs = []

    try:
        for page_index in range(pages):
            page_df = p_dfs[page_index]
            page_df = page_df.where(page_df.notnull(), None)
            page_lis = []
            child_lis = []

            for index, row in page_df.iterrows():
                if row['children'] == None:
                    page_lis.append(change_font(row["font_family"], lang))
                    child_lis.append(row['children'])
                else:
                    sub_block_children = pd.read_json(row['children'])
                    sub_block_children = sub_block_children.where(
                        sub_block_children.notnull(), None)
                    page_lis1 = []
                    child_lis1 = []
                    for index2, row2 in sub_block_children.iterrows():
                        if row2['children'] == None:
                            child_lis1.append(row2['children'])
                            page_lis1.append(
                                change_font(row2["font_family"], lang))
                        else:
                            sub2_block_children = pd.read_json(
                                row2['children'])
                            sub2_block_children = sub2_block_children.where(
                                sub2_block_children.notnull(), None)
                            page_lis2 = []
                            for index3, row3 in sub2_block_children.iterrows():
                                page_lis2.append(
                                    change_font(row3["font_family"], lang))

                            sub2_block_children['font_family'] = page_lis2
                            #print(sub2_block_children)
                            page_lis1.append(
                                max(set(page_lis2), key=page_lis2.count))
                            child_lis1.append(sub2_block_children.to_json())
                            #print(child_lis1)

                    sub_block_children['font_family'] = page_lis1
                    sub_block_children['children'] = child_lis1

                    page_lis.append(max(set(page_lis1), key=page_lis1.count))
                    child_lis.append(sub_block_children.to_json())

            page_df['font_family'] = page_lis
            page_df['children'] = child_lis
            new_dfs.append(page_df)

        end_time = time.time()
        extraction_time = end_time - start_time
        log_info('Updating of fonts completed in {}'.format(extraction_time),
                 app_context.application_context)

    except Exception as e:
        log_error('Error in updating fonts', app_context.application_context,
                  e)
        return None

    return new_dfs
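
A minimal sketch (column names invented) of the nesting convention update_font() relies on: each 'children' cell holds a child table serialized with to_json(), which pd.read_json() turns back into a DataFrame.

import pandas as pd

child = pd.DataFrame({"font_family": ["Arial"], "children": [None]})
parent = pd.DataFrame({"font_family": ["Serif"], "children": [child.to_json()]})

restored = pd.read_json(parent.loc[0, "children"])
print(restored)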
Example #39
    RMSErbarsum = 0
    it = 0

    conn = comm.openServerConn(args.backend_port, args.backend_ip)

    batch_id = 0
    while True:
        batch_id += 1
        print("Waiting for new batch...")
        try:
            comm.sendMessage(conn, "ACK")
        except BrokenPipeError:
            print("Connection with frontend terminated!")
            break
        msg = comm.getMessage(conn, isJSON=True)  # fetch data from frontend
        mbatch = pd.read_json(msg)
        """Batching phase"""
        start = time.time()
        for index, row in mbatch.iterrows():
            '''batch1: Get new observation (user, item, rating, timestamp)'''
            user = int(row['user'])
            item_id = int(row['item'])
            rating = float(row['rating'])
            timestamp = int(row['unixtimestamp'])
            '''batch2: Add it to fresh_ratings dictionary'''
            if user in tempTimestep: tempTimestep[user] += 1
            else: tempTimestep[user] = 1
            fresh_ratings[(user, item_id)] = (rating, tempTimestep[user])

            if rmse:
                '''batch3a: Predict'''
Example #40
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 19 13:09:26 2019

Applied Statistical Methods Final Project
Data Processing/ Formatting Code 

@author: Ellie Frith
"""


import pandas as pd
import numpy as np
import re

pompeo_df = pd.read_json('sec_pompeo_tweets.json')
trump_df = pd.read_json('trump_tweets.json')


#Response from Twitter API has lots of extraneous information and dense 
#dictionaries: want to extract the information useful for this analysis 
  

def media_type(extended_entities):
    
    '''This function extracts the media type of any media attached to
    a tweet (image, video, etc). If tweet has no media, the function
    will return a value of nan. '''
    
    if pd.isnull(extended_entities)==True:
        media = np.nan
Example #41
def load_multiformat_time_series_data():
    """Loading time series data from a zip file in the repo"""
    with gzip.open(os.path.join(DATA_FOLDER,
                                'multiformat_time_series.json.gz')) as f:
        pdf = pd.read_json(f)
    pdf.ds = pd.to_datetime(pdf.ds, unit='s')
    pdf.ds2 = pd.to_datetime(pdf.ds2, unit='s')
    pdf.to_sql('multiformat_time_series',
               db.engine,
               if_exists='replace',
               chunksize=500,
               dtype={
                   "ds": Date,
                   'ds2': DateTime,
                   "epoch_s": BigInteger,
                   "epoch_ms": BigInteger,
                   "string0": String(100),
                   "string1": String(100),
                   "string2": String(100),
                   "string3": String(100),
               },
               index=False)
    print("Done loading table!")
    print("-" * 80)
    print("Creating table [multiformat_time_series] reference")
    obj = db.session.query(TBL).filter_by(
        table_name='multiformat_time_series').first()
    if not obj:
        obj = TBL(table_name='multiformat_time_series')
    obj.main_dttm_col = 'ds'
    obj.database = get_or_create_main_db()
    obj.is_featured = False
    dttm_and_expr_dict = {
        'ds': [None, None],
        'ds2': [None, None],
        'epoch_s': ['epoch_s', None],
        'epoch_ms': ['epoch_ms', None],
        'string2': ['%Y%m%d-%H%M%S', None],
        'string1': ['%Y-%m-%d^%H:%M:%S', None],
        'string0': ['%Y-%m-%d %H:%M:%S.%f', None],
        'string3': ['%Y/%m/%d%H:%M:%S.%f', None],
    }
    for col in obj.columns:
        dttm_and_expr = dttm_and_expr_dict[col.column_name]
        col.python_date_format = dttm_and_expr[0]
        col.dbatabase_expr = dttm_and_expr[1]
        col.is_dttm = True
    db.session.merge(obj)
    db.session.commit()
    obj.fetch_metadata()
    tbl = obj

    print("Creating some slices")
    for i, col in enumerate(tbl.columns):
        slice_data = {
            "granularity_sqla": col.column_name,
            "datasource_id": "8",
            "datasource_name": "multiformat_time_series",
            "datasource_type": "table",
            "granularity": "day",
            "row_limit": config.get("ROW_LIMIT"),
            "since": "1 year ago",
            "until": "now",
            "where": "",
            "viz_type": "cal_heatmap",
            "domain_granularity": "month",
            "subdomain_granularity": "day",
        }

        slc = Slice(
            slice_name="Calendar Heatmap multiformat " + str(i),
            viz_type='cal_heatmap',
            datasource_type='table',
            datasource_id=tbl.id,
            params=get_slice_json(slice_data),
        )
        merge_slice(slc)
    misc_dash_slices.append(slc.slice_name)
Example #42
import os
import glob
import psycopg2
import pandas as pd
from sql_queries import *


def process_song_file(cur, filepath):
     """Reads songs log file row by row, selects needed fields and inserts them into song and artist tables.
        Parameters:
            cur (psycopg2.cursor()): Cursor of the sparkifydb database
            filepath (str): Filepath of the file to be analyzed
    """
    # open song file
    df = pd.read_json(filepath, lines=True)

    # insert song record
    song_data = df[["song_id", "title", "artist_id", "year", "duration"]].values[0].tolist()
    cur.execute(song_table_insert, song_data)
    
    # insert artist record
    artist_data = df[["artist_id", "artist_name", "artist_location", "artist_latitude", "artist_longitude"]].values[0].tolist()
    cur.execute(artist_table_insert, artist_data)


def process_log_file(cur, filepath):
    """
        This function reads Log files and reads information of time, user and songplay data and saves into time, user, songplay
        Arguments:
        cur: Database Cursor
        filepath: location of Log files
Example #43
def json2dtf(jsondata):
    '''
    get json as pandas
    '''
    dtf = pd.read_json(jsondata)
    return dtf
Example #44
def load_birth_names():
    """Loading birth name dataset from a zip file in the repo"""
    with gzip.open(os.path.join(DATA_FOLDER, 'birth_names.json.gz')) as f:
        pdf = pd.read_json(f)
    pdf.ds = pd.to_datetime(pdf.ds, unit='ms')
    pdf.to_sql('birth_names',
               db.engine,
               if_exists='replace',
               chunksize=500,
               dtype={
                   'ds': DateTime,
                   'gender': String(16),
                   'state': String(10),
                   'name': String(255),
               },
               index=False)
    l = []
    print("Done loading table!")
    print("-" * 80)

    print("Creating table [birth_names] reference")
    obj = db.session.query(TBL).filter_by(table_name='birth_names').first()
    if not obj:
        obj = TBL(table_name='birth_names')
    obj.main_dttm_col = 'ds'
    obj.database = get_or_create_main_db()
    obj.is_featured = True
    db.session.merge(obj)
    db.session.commit()
    obj.fetch_metadata()
    tbl = obj

    defaults = {
        "compare_lag": "10",
        "compare_suffix": "o10Y",
        "datasource_id": "1",
        "datasource_name": "birth_names",
        "datasource_type": "table",
        "flt_op_1": "in",
        "limit": "25",
        "granularity": "ds",
        "groupby": [],
        "metric": 'sum__num',
        "metrics": ["sum__num"],
        "row_limit": config.get("ROW_LIMIT"),
        "since": "100 years ago",
        "until": "now",
        "viz_type": "table",
        "where": "",
        "markup_type": "markdown",
    }

    print("Creating some slices")
    slices = [
        Slice(slice_name="Girls",
              viz_type='table',
              datasource_type='table',
              datasource_id=tbl.id,
              params=get_slice_json(defaults,
                                    groupby=['name'],
                                    flt_col_1='gender',
                                    flt_eq_1="girl",
                                    row_limit=50)),
        Slice(slice_name="Boys",
              viz_type='table',
              datasource_type='table',
              datasource_id=tbl.id,
              params=get_slice_json(defaults,
                                    groupby=['name'],
                                    flt_col_1='gender',
                                    flt_eq_1="boy",
                                    row_limit=50)),
        Slice(slice_name="Participants",
              viz_type='big_number',
              datasource_type='table',
              datasource_id=tbl.id,
              params=get_slice_json(defaults,
                                    viz_type="big_number",
                                    granularity="ds",
                                    compare_lag="5",
                                    compare_suffix="over 5Y")),
        Slice(slice_name="Genders",
              viz_type='pie',
              datasource_type='table',
              datasource_id=tbl.id,
              params=get_slice_json(defaults,
                                    viz_type="pie",
                                    groupby=['gender'])),
        Slice(slice_name="Genders by State",
              viz_type='dist_bar',
              datasource_type='table',
              datasource_id=tbl.id,
              params=get_slice_json(
                  defaults,
                  flt_eq_1="other",
                  viz_type="dist_bar",
                  metrics=['sum__sum_girls', 'sum__sum_boys'],
                  groupby=['state'],
                  flt_op_1='not in',
                  flt_col_1='state')),
        Slice(slice_name="Trends",
              viz_type='line',
              datasource_type='table',
              datasource_id=tbl.id,
              params=get_slice_json(defaults,
                                    viz_type="line",
                                    groupby=['name'],
                                    granularity='ds',
                                    rich_tooltip='y',
                                    show_legend='y')),
        Slice(slice_name="Average and Sum Trends",
              viz_type='dual_line',
              datasource_type='table',
              datasource_id=tbl.id,
              params=get_slice_json(defaults,
                                    viz_type="dual_line",
                                    metric='avg__num',
                                    metric_2='sum__num',
                                    granularity='ds')),
        Slice(slice_name="Title",
              viz_type='markup',
              datasource_type='table',
              datasource_id=tbl.id,
              params=get_slice_json(defaults,
                                    viz_type="markup",
                                    markup_type="html",
                                    code="""\
<div style="text-align:center">
    <h1>Birth Names Dashboard</h1>
    <p>
        The source dataset came from
        <a href="https://github.com/hadley/babynames">[here]</a>
    </p>
    <img src="/static/assets/images/babytux.jpg">
</div>
""")),
        Slice(slice_name="Name Cloud",
              viz_type='word_cloud',
              datasource_type='table',
              datasource_id=tbl.id,
              params=get_slice_json(defaults,
                                    viz_type="word_cloud",
                                    size_from="10",
                                    series='name',
                                    size_to="70",
                                    rotation="square",
                                    limit='100')),
        Slice(slice_name="Pivot Table",
              viz_type='pivot_table',
              datasource_type='table',
              datasource_id=tbl.id,
              params=get_slice_json(defaults,
                                    viz_type="pivot_table",
                                    metrics=['sum__num'],
                                    groupby=['name'],
                                    columns=['state'])),
        Slice(slice_name="Number of Girls",
              viz_type='big_number_total',
              datasource_type='table',
              datasource_id=tbl.id,
              params=get_slice_json(defaults,
                                    viz_type="big_number_total",
                                    granularity="ds",
                                    flt_col_1='gender',
                                    flt_eq_1='girl',
                                    subheader='total female participants')),
    ]
    for slc in slices:
        merge_slice(slc)

    print("Creating a dashboard")
    dash = db.session.query(Dash).filter_by(dashboard_title="Births").first()

    if not dash:
        dash = Dash()
    js = textwrap.dedent("""\
    [
        {
            "col": 9,
            "row": 6,
            "size_x": 2,
            "size_y": 4,
            "slice_id": "1267"
        },
        {
            "col": 11,
            "row": 6,
            "size_x": 2,
            "size_y": 4,
            "slice_id": "1268"
        },
        {
            "col": 1,
            "row": 0,
            "size_x": 2,
            "size_y": 2,
            "slice_id": "1269"
        },
        {
            "col": 3,
            "row": 0,
            "size_x": 2,
            "size_y": 2,
            "slice_id": "1270"
        },
        {
            "col": 5,
            "row": 3,
            "size_x": 8,
            "size_y": 3,
            "slice_id": "1271"
        },
        {
            "col": 1,
            "row": 6,
            "size_x": 8,
            "size_y": 4,
            "slice_id": "1272"
        },
        {
            "col": 10,
            "row": 0,
            "size_x": 3,
            "size_y": 3,
            "slice_id": "1273"
        },
        {
            "col": 5,
            "row": 0,
            "size_x": 5,
            "size_y": 3,
            "slice_id": "1274"
        },
        {
            "col": 1,
            "row": 2,
            "size_x": 4,
            "size_y": 4,
            "slice_id": "1275"
        }
    ]
        """)
    l = json.loads(js)
    for i, pos in enumerate(l):
        pos['slice_id'] = str(slices[i].id)
    dash.dashboard_title = "Births"
    dash.position_json = json.dumps(l, indent=4)
    dash.slug = "births"
    dash.slices = slices[:-1]
    db.session.merge(dash)
    db.session.commit()
Example #45
0
import json

import pandas as pd
import matplotlib
import OpenBlender
import wordcloud

action = 'API_getObservationsFromDataset'
parameters = {
    'token': 'YOUR_TOKEN_HERE',
    'id_dataset': '5e6ac97595162921fda18076',
    'date_filter': {
        "start_date": "2020-01-01T06:00:00.000Z",
        "end_date": "2020-03-11T06:00:00.000Z"
    },
}

df_confirmed = pd.read_json(json.dumps(
    OpenBlender.call(action, parameters)['sample']),
                            convert_dates=False,
                            convert_axes=False).sort_values('timestamp',
                                                            ascending=False)
df_confirmed.reset_index(drop=True, inplace=True)
df_confirmed.head(10)
Example #46
0
def plot_history(history,
                 metric,
                 orient='index',
                 validation=False,
                 title='Metric Function',
                 xlabel='epochs',
                 ylabel='metric',
                 logx=False,
                 logy=False,
                 smooth=None,
                 smooth_window=10,
                 alpha=None,
                 legend='auto',
                 bbox_to_anchor=(1.0, 0.0),
                 anchor='lower left',
                 base_size=(640, 480),
                 base_ratio=(1, 1),
                 subplots=None,
                 return_ax=False,
                 out_name=None,
                 dpi=72,
                 **kwargs):
    '''
    Plot a metric or loss function from a Keras history.

    Required arguments:
        history: location of the JSON file containing the training history (a dict or DataFrame also works),
        metric:  the name of the metric to plot, or a list of metrics.

    Optional arguments:
        orient:         orientation of the JSON file (for Pandas),
        root:           root of the save location,
        validation:     also plot the validation metric,
        title:          title of the plot,
        xlabel:         label of the x-axis,
        ylabel:         label of the y-axis,
        logx:           use log scale on the x-axis,
        logy:           use log scale on the y-axis,
        smooth:         smooth the curve with a moving average,
        smooth_window:  width of the smoothing window,
        alpha:          transparency factor,
        legend:         type of legend to draw (https://seaborn.pydata.org/generated/seaborn.scatterplot.html#seaborn.scatterplot),
        bbox_to_anchor: position of the legend,
        anchor:         anchor of the legend,
        base_size:      tuple with the size of the output,
        base_ratio:     ratio of the output plot (tuple),
        subplots:       pass a tuple of (figure, axis) to use an existing axis,
        return_ax:      return the axis object if requested,
        out_name:       name of the output file (without extension),
        dpi:            resolution of the saved figure,
        **kwargs:       additional arguments passed to savefig.
    '''

    # open JSON file
    if isinstance(history, str):
        hst = pd.read_json(history, orient=orient)
    else:
        hst = pd.DataFrame(history)

    # select data
    if not isinstance(metric, list):
        metric = [metric]

    # plot the function
    if subplots is not None:
        fig, ax = subplots
    else:
        X, Y = base_ratio
        fig, ax = plt.subplots(1,
                               1,
                               figsize=ratio(X,
                                             Y,
                                             base=base_size,
                                             dpi=dpi),
                               dpi=dpi)

    # define the palettes
    plot_palette_train = ['tab:blue', 'tab:cyan', 'tab:purple', 'tab:grey']
    plot_palette_val = ['tab:red', 'tab:orange', 'tab:pink', 'tab:olive']

    # plot the metrics
    plot_metrics = []
    plot_palette = {}
    plot_dashes = {}
    plot_hst = {}
    for n, m in enumerate(metric):
        plot_metrics.append(m)
        plot_palette[m] = plot_palette_train[n]
        plot_dashes[m] = '-'

        # smooth with a moving average if requested; fall back to the raw
        # values when the window is not usable
        if smooth and 1 < smooth_window < np.shape(hst[m])[0]:
            plot_hst[m] = np.convolve(hst[m],
                                      np.ones(smooth_window) / smooth_window,
                                      mode='valid')
        else:
            plot_hst[m] = hst[m]

        if validation:
            plot_metrics.append('val_' + m)
            plot_palette['val_' + m] = plot_palette_val[n]
            plot_dashes['val_' + m] = '--'

            # smooth with a moving average if requested; fall back to the raw
            # values when the window is not usable
            if smooth and 1 < smooth_window < np.shape(hst['val_' + m])[0]:
                plot_hst['val_' + m] = np.convolve(hst['val_' + m],
                                                   np.ones(smooth_window) /
                                                   smooth_window,
                                                   mode='valid')
            else:
                plot_hst['val_' + m] = hst['val_' + m]

    # plot the new metrics
    plot_hst = pd.DataFrame(plot_hst)
    for var in plot_metrics:
        lineplot = {
            'data': plot_hst,
            'x': plot_hst.index,
            'y': var,
            'alpha': alpha,
            'linestyle': plot_dashes[var],
            'color': plot_palette[var],
            'legend': legend
        }
        sns.lineplot(**lineplot, ax=ax)

    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    if len(plot_metrics) > 1:
        ax.legend(labels=plot_metrics,
                  bbox_to_anchor=bbox_to_anchor,
                  loc=anchor)

    if logx:
        ax.set_xscale('log')
    if logy:
        ax.set_yscale('log')

    # save the figure
    if out_name is not None:
        savefig(out_name, fig, dpi=dpi, **kwargs)

    if return_ax:
        return ax
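
# A hedged usage sketch (not part of the original function): it assumes a Keras
# history was saved as 'history.json' via json.dump(model.fit(...).history, fp),
# i.e. a dict of per-epoch lists, which pandas reads with orient='columns'.
ax = plot_history('history.json',
                  metric='loss',
                  orient='columns',
                  validation=True,
                  smooth=True,
                  smooth_window=10,
                  title='Training history',
                  ylabel='loss',
                  return_ax=True)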
Example #47
0
Anthony Ralston - [email protected]

"""

# Import required libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import json

# Read json data & prepare dataframe
print("Reading json data & preparing dataframes...")
training_dataframe = pd.read_json('train.json').set_index('id')
testing_dataframe = pd.read_json('test.json').set_index('id')
training_df_index = training_dataframe.index
testing_df_index = testing_dataframe.index
y = training_dataframe.cuisine.copy()
print("Training Data Samples: ", training_dataframe.shape)
print("Testing Data Samples: ", testing_dataframe.shape)

# Combine for pre-processing
print("Combining data for pre-processing...")
dataframe = pd.concat(
    [training_dataframe.drop('cuisine', axis=1), testing_dataframe])
dataframe_index = dataframe.index
print("Concatenated Samples: ", dataframe.shape)

# Visualise cuisine training data
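# The snippet above is cut off here; a hedged sketch (an assumption, not the
# original author's code) of how the cuisine distribution could be visualised
# with the seaborn import already present:
sns.countplot(y=y, order=y.value_counts().index)
plt.title('Cuisine distribution in the training data')
plt.tight_layout()
plt.show()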
Example #48
0
import json
from random import shuffle
import pandas as pd
import numpy as np
from matplotlib import pyplot
from sklearn.neighbors import NearestNeighbors

repr_json = "../output/test_representations.json"

data = pd.read_json(repr_json)

data = data.sample(n=1000)

img_repr = data['image_repr'].tolist()
text_repr = data['text_repr'].tolist()

nn = NearestNeighbors(n_jobs=-1, n_neighbors=1000)

nn.fit(text_repr)

preds = nn.kneighbors(img_repr, return_distance=False).tolist()
ranks = []

for i, x in enumerate(preds):
    rank = x.index(i) + 1
    ranks.append(rank)

print("Average rank :", np.mean(ranks))
Example #49
0
    def extractApiData(self):
        print('\nExtracting Glofas Data\n')

        # Load input data
        df_thresholds = pd.read_json(json.dumps(self.GLOFAS_STATIONS))
        df_thresholds = df_thresholds.set_index("stationCode", drop=False)
        df_district_mapping = pd.read_json(json.dumps(self.DISTRICT_MAPPING))
        df_district_mapping = df_district_mapping.set_index("glofasStation",
                                                            drop=False)

        # Set up variables to fill
        stations = []
        trigger_per_day = {
            1: 0,
            2: 0,
            3: 0,
            4: 0,
            5: 0,
            6: 0,
            7: 0,
        }

        # Load netCDF data
        ncData = xr.open_dataset(self.inputPath + 'glofas-api-' +
                                 self.country_code + '-' + self.current_date +
                                 '.nc')

        # Transform lon/lat values
        lons = np.linspace(
            ncData.dis24.attrs['GRIB_longitudeOfFirstGridPointInDegrees'],
            ncData.dis24.attrs['GRIB_longitudeOfLastGridPointInDegrees'],
            num=ncData.dis24.attrs['GRIB_Nx'])
        lats = np.linspace(
            ncData.dis24.attrs['GRIB_latitudeOfFirstGridPointInDegrees'],
            ncData.dis24.attrs['GRIB_latitudeOfLastGridPointInDegrees'],
            num=ncData.dis24.attrs['GRIB_Ny'])
        ds = ncData['dis24']
        ds.coords['latitude'] = lats
        ds.coords['longitude'] = lons
        ncData2 = ds.to_dataset()

        for index, row in df_thresholds.iterrows():
            station = {}
            station['code'] = row['stationCode']

            if station['code'] in df_district_mapping[
                    'glofasStation'] and station['code'] != 'no_station':
                print(station['code'])
                threshold = df_thresholds[df_thresholds['stationCode'] ==
                                          station['code']][TRIGGER_LEVEL][0]

                for step in range(1, 8):
                    # Loop through 51 ensembles, get forecast and compare to threshold
                    ensemble_options = 51
                    count = 0
                    dis_sum = 0

                    deltax = 0.1
                    st_lat = row['lat']  #34.05
                    st_lon = row['lon']  #0.05
                    for ensemble in range(1, ensemble_options):

                        dischargeArray = ncData2['dis24'].sel(
                            latitude=slice(st_lat + deltax, st_lat - deltax),
                            longitude=slice(st_lon - deltax, st_lon + deltax),
                            step=str(step) + ' days',
                            number=ensemble).values.flatten()
                        discharge = np.nanmax(dischargeArray)

                        # MOCK OVERWRITE DEPENDING ON COUNTRY SETTING
                        if SETTINGS_SECRET[self.country_code]['mock'] == True:
                            if SETTINGS_SECRET[self.country_code][
                                    'if_mock_trigger'] == True:
                                if step < 5:  # Only dummy trigger for 5-day and above
                                    discharge = 0
                                elif station[
                                        'code'] == 'DWRM1':  # UGA dummy flood station 1
                                    discharge = 1000
                                elif station[
                                        'code'] == 'G1067':  # ETH dummy flood station 1
                                    discharge = 1000
                                elif station[
                                        'code'] == 'G1904':  # ETH dummy flood station 2
                                    discharge = 2000
                                elif station[
                                        'code'] == 'G5194':  # KEN dummy flood station
                                    discharge = 2000
                                else:
                                    discharge = 0
                            else:
                                discharge = 0

                        if discharge >= threshold:
                            count = count + 1
                        dis_sum = dis_sum + discharge

                    prob = count / ensemble_options
                    dis_avg = dis_sum / ensemble_options
                    station['fc'] = dis_avg
                    station['fc_prob'] = prob
                    station['fc_trigger'] = 1 if prob > TRIGGER_LEVELS[
                        'minimum'] else 0

                    if station['fc_trigger'] == 1:
                        trigger_per_day[step] = 1

                    if step == self.leadTimeValue:
                        stations.append(station)
                    station = {}
                    station['code'] = row['stationCode']

        # Add 'no_station'
        for station_code in ['no_station']:
            station = {}
            station['code'] = station_code
            station['fc'] = 0
            station['fc_prob'] = 0
            station['fc_trigger'] = 0
            stations.append(station)

        with open(self.extractedGlofasPath, 'w') as fp:
            json.dump(stations, fp)
            print('Extracted Glofas data - File saved')

        with open(self.triggerPerDay, 'w') as fp:
            json.dump([trigger_per_day], fp)
            print('Extracted Glofas data - Trigger per day File saved')
Example #50
0
    if isfile(join(current_dataset_path, f)) and f.endswith('.json')
]
print("Following dataset files have been found: {}".format(all_files_names))

dataset_paths = []
for path in all_files_names:
    dataset_path = current_dataset_path + path
    dataset_paths.append(dataset_path)

print("All files found: {}".format(dataset_paths))

y_total = None
x_total = None
for (index, path) in enumerate(dataset_paths):
    print("For file at {}".format(path))
    df = pd.read_json(path)
    keypoints = df['Keypoints'].values
    file_y = df['Category'].values
    file_x = []
    for k in keypoints:
        if k is not None:
            newK = np.reshape(np.asarray(k), (25, 3))
            file_x.append(newK)
        else:
            file_x.append(np.reshape(np.zeros(75), (25, 3)))
    file_x = np.array(file_x)
    print("For file at {} found {} frames".format(path, file_y.shape[0]))
    if np.all(x_total is None):
        x_total = file_x
    else:
        x_total = np.vstack((x_total, file_x))
    def time_series_day_of_week_plot(self, data_path):
        #data_path = os.path.dirname(__file__) + "/../data/pbFollowers/merged/"
        data_path = data_path + "merged/"
        dir_files = os.listdir(data_path)

        counts_of_tweets = {}

        for filename in dir_files:
            df = pd.read_json(data_path + filename)
            df = df.set_index(df['created_at'])
            temp = pd.DatetimeIndex(df['created_at'])
            df['weekday'] = temp.day_name()
            p = df.groupby(df['weekday'])
            freq_of_tweets = p['created_at'].count()
            freq_dict = freq_of_tweets.to_dict()
            for key, value in freq_dict.items():
                try:
                    counts_of_tweets[key] += value
                except KeyError:
                    counts_of_tweets[key] = value
            # index like freq_dict.get(pd.Timestamp('2018-01-31'))

        #dates = np.fromiter(counts_of_tweets.keys(), dtype=object)
        #counts = np.fromiter(counts_of_tweets.values(), dtype=float)

        # output to static HTML file
        tweet_freq_plot_path = data_path + "../plots/Tweet_freq_day_of_week.html"
        output_file(tweet_freq_plot_path)

        days_of_week = [
            'Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
            'Saturday'
        ]
        counts_of_week = [
            counts_of_tweets[days_of_week[0]],
            counts_of_tweets[days_of_week[1]],
            counts_of_tweets[days_of_week[2]],
            counts_of_tweets[days_of_week[3]],
            counts_of_tweets[days_of_week[4]],
            counts_of_tweets[days_of_week[5]],
            counts_of_tweets[days_of_week[6]]
        ]

        fill_color = [
            "#ff0000", "#ff4000", "#ff8000", "#ffbf00", "#ffsff00", "#bfff00",
            "#80ff00"
        ]

        source = ColumnDataSource(data=dict(days_of_week=days_of_week,
                                            counts_of_week=counts_of_week,
                                            fill_color=fill_color))

        hover = HoverTool(tooltips=[
            ('days_of_week', '@days_of_week'),
            ('counts_of_week', '@counts_of_week'),
        ])

        p = figure(x_range=days_of_week,
                   plot_height=350,
                   toolbar_location=None,
                   title="Freq of Tweets by Day of Week",
                   tools=[hover])
        p.vbar(x='days_of_week',
               top='counts_of_week',
               width=0.9,
               source=source,
               line_color='white',
               fill_color='fill_color')

        #show(p)
        return p
    def time_series_frequency_analysis(self, data_path):
        #data_path = os.path.dirname(__file__) + "/../data/pbFollowers/merged/"
        data_path = data_path + "merged/"
        dir_files = os.listdir(data_path)

        counts_of_tweets = {}

        for filename in dir_files:
            df = pd.read_json(data_path + filename)
            df = df.set_index(df['created_at'])
            p = df.groupby(pd.Grouper(freq="D"))
            freq_of_tweets = p['created_at'].count()
            freq_dict = freq_of_tweets.to_dict()
            for key, value in freq_dict.items():
                try:
                    counts_of_tweets[key.to_pydatetime()] += value
                except KeyError:
                    counts_of_tweets[key.to_pydatetime()] = value
            # index like freq_dict.get(pd.Timestamp('2018-01-31'))

        dates = np.fromiter(counts_of_tweets.keys(), dtype='datetime64[us]')
        counts = np.fromiter(counts_of_tweets.values(), dtype=float)
        real_dates = [str(x)[:10] for x in dates]

        # window_size = 30
        # window = np.ones(window_size) / float(window_size)
        # counts_avg = np.convolve(counts, window, 'same')

        # output to static HTML file
        tweet_freq_plot_path = data_path + "../plots/Tweet_freq_by_day.html"
        output_file(tweet_freq_plot_path,
                    title="Tweet frequency of my followers")

        source = ColumnDataSource(
            data=dict(dates=dates, tweet_counts=counts, real_date=real_dates))

        hover = HoverTool(tooltips=[
            ('date', '@real_date'),
            ('tweet_counts', '@tweet_counts'),
        ])

        # create a new plot with a a datetime axis type
        p = figure(width=800,
                   height=350,
                   x_axis_type="datetime",
                   tools=[hover, 'box_zoom', 'pan', 'wheel_zoom', 'reset'])

        # add renderers
        #p.circle(dates, counts, size=4, color='blue', alpha=0.8)
        p.circle(x='dates',
                 y='tweet_counts',
                 size=6,
                 source=source,
                 color='blue',
                 alpha=0.6)
        # p.line(dates, counts_avg, color='grey')

        p.title.text = "Tweet Frequency of @patrickbeekman's followers"
        p.xaxis.axis_label = 'Date'
        p.yaxis.axis_label = '# of Tweets'

        #show(p)
        print("Frequency of tweets graph created!")
        return p
Example #53
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--source',
                        type=Path,
                        help='Source dir',
                        required=True)
    parser.add_argument('--videodataset',
                        type=Path,
                        default='data/dfdc_videos.pkl',
                        help='Path to save the videos DataFrame')
    parser.add_argument('--batch', type=int, help='Batch size', default=64)

    args = parser.parse_args()

    ## Parameters parsing
    source_dir: Path = args.source
    videodataset_path: Path = args.videodataset
    batch_size: int = args.batch

    ## DataFrame
    if videodataset_path.exists():
        print('Loading video DataFrame')
        df_videos = pd.read_pickle(videodataset_path)
    else:
        print('Creating video DataFrame')

        # Create output folder
        videodataset_path.parent.mkdir(parents=True, exist_ok=True)

        # Index
        df_train_list = list()
        for idx, json_path in enumerate(
                tqdm(sorted(source_dir.rglob('metadata.json')),
                     desc='Indexing')):
            df_tmp = pd.read_json(json_path, orient='index')
            df_tmp['path'] = df_tmp.index.map(lambda x: str(
                json_path.parent.relative_to(source_dir).joinpath(x)))
            df_tmp['folder'] = int(str(json_path.parts[-2]).split('_')[-1])
            df_train_list.append(df_tmp)
        df_videos = pd.concat(df_train_list, axis=0, verify_integrity=True)

        # Save space
        del df_videos['split']
        df_videos['label'] = df_videos['label'] == 'FAKE'
        df_videos['original'] = df_videos['original'].astype('category')
        df_videos['folder'] = df_videos['folder'].astype(np.uint8)

        # Collect metadata
        paths_arr = np.asarray(
            df_videos.path.map(lambda x: str(source_dir.joinpath(x))))
        height_list = []
        width_list = []
        frames_list = []
        with Pool() as pool:
            for batch_idx0 in tqdm(np.arange(start=0,
                                             stop=len(df_videos),
                                             step=batch_size),
                                   desc='Metadata'):
                batch_res = pool.map(
                    extract_meta_av,
                    paths_arr[batch_idx0:batch_idx0 + batch_size])
                for res in batch_res:
                    height_list.append(res[0])
                    width_list.append(res[1])
                    frames_list.append(res[2])

        df_videos['height'] = np.asarray(height_list, dtype=np.uint16)
        df_videos['width'] = np.asarray(width_list, dtype=np.uint16)
        df_videos['frames'] = np.asarray(frames_list, dtype=np.uint16)

        print('Saving video DataFrame to {}'.format(videodataset_path))
        df_videos.to_pickle(str(videodataset_path))

    print('Real videos: {:d}'.format(sum(df_videos['label'] == 0)))
    print('Fake videos: {:d}'.format(sum(df_videos['label'] == 1)))
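
# A hedged invocation sketch (the script filename and the __main__ guard are
# assumptions; the flags come from the argparse definition above):
#   python index_dfdc_videos.py --source /path/to/dfdc \
#       --videodataset data/dfdc_videos.pkl --batch 64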
Example #54
0
    b = df_fluxo_de_pessoas['tarde'] / df_quantidade_de_dias_semana['tarde']
    c = df_fluxo_de_pessoas['noite'] / df_quantidade_de_dias_semana['noite']
    df_fluxo_medio_pessoas = pd.concat([a, b, c], axis=1, sort=False)
    #### FLOW CREATION
    df_fluxo_medio_pessoas.to_csv('Resource/fluxo_medio_pessoas.csv',
                                  sep=';',
                                  index=True)


if __name__ == "__main__":

    #### COMPETITOR
    #path_current = os.getcwd()
    #path_complet_with_csv = '/Dados_Cliente/concorrentes.csv'
    df_concorrente = pd.read_csv('Dados_Cliente/concorrentes.csv')
    novo_concorrente(df_concorrente)
    ### NEIGHBORHOOD
    #path_complet_with_csv = os.path.join(path_current +'/Dados_Cliente/populacao.json')
    populacao = pd.read_json('Dados_Cliente/populacao.json')
    populacao = pd.DataFrame(data=populacao)

    #path_complet_with_csv = os.path.join(path_current +'/Dados_Cliente/bairros.csv')
    bairro = pd.read_csv('Dados_Cliente/bairros.csv')
    novo_bairro(bairro, populacao)

    ### CALLING EVENTS
    #path_complet_with_csv = os.path.join(path_current +'/Dados_Cliente/eventos_de_fluxo.csv')
    eventos_fluxo = pd.read_csv('Dados_Cliente/eventos_de_fluxo.csv')
    calculo_dias(eventos_fluxo)
    print('Done')
Example #55
0
# MONGO_COLLECTION = 'shop_high0727'
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

def save_to_mongo(result):
    """
    Save a result to MongoDB.
    :param result: the result to store
    """
    try:
        if db[MONGO_COLLECTION].insert(result):
            print('Saved to MongoDB successfully')
    except Exception:
        print('Failed to save to MongoDB')
        
df1 = pd.read_json('e-cigar-high.json')
df2 = pd.read_json('e-cigar-low.json')
df = pd.concat([df1, df2])
shop_link = df['s_link'].value_counts().index.tolist()
print('{} shops in total'.format(len(shop_link)))
c = 0
for link in shop_link:
    c += 1
    shop_id = link[link.find('id=') + 3:]
    print('Looking up shop #{}'.format(c))
    url = 'https://hdc1.alicdn.com/asyn.htm?userId=' + shop_id
    res = requests.get(url).text
    ch2del = ('\\r\\n', '\r\n', '\\', '\t', '\n')
    for ch in ch2del:
        res = res.replace(ch, '')
    doc = pq(res)
Example #56
0
def load_world_bank_health_n_pop():
    """Loads the world bank health dataset, slices and a dashboard"""
    tbl_name = 'wb_health_population'
    with gzip.open(os.path.join(DATA_FOLDER, 'countries.json.gz')) as f:
        pdf = pd.read_json(f)
    pdf.columns = [col.replace('.', '_') for col in pdf.columns]
    pdf.year = pd.to_datetime(pdf.year)
    pdf.to_sql(tbl_name,
               db.engine,
               if_exists='replace',
               chunksize=50,
               dtype={
                   'year': DateTime(),
                   'country_code': String(3),
                   'country_name': String(255),
                   'region': String(255),
               },
               index=False)

    print("Creating table [wb_health_population] reference")
    tbl = db.session.query(TBL).filter_by(table_name=tbl_name).first()
    if not tbl:
        tbl = TBL(table_name=tbl_name)
    tbl.description = utils.readfile(os.path.join(DATA_FOLDER, 'countries.md'))
    tbl.main_dttm_col = 'year'
    tbl.is_featured = True
    tbl.database = get_or_create_main_db()
    db.session.merge(tbl)
    db.session.commit()
    tbl.fetch_metadata()

    defaults = {
        "compare_lag": "10",
        "compare_suffix": "o10Y",
        "datasource_id": "1",
        "datasource_name": "birth_names",
        "datasource_type": "table",
        "limit": "25",
        "granularity": "year",
        "groupby": [],
        "metric": 'sum__SP_POP_TOTL',
        "metrics": ["sum__SP_POP_TOTL"],
        "row_limit": config.get("ROW_LIMIT"),
        "since": "2014-01-01",
        "until": "2014-01-02",
        "where": "",
        "markup_type": "markdown",
        "country_fieldtype": "cca3",
        "secondary_metric": "sum__SP_POP_TOTL",
        "entity": "country_code",
        "show_bubbles": "y",
    }

    print("Creating slices")
    slices = [
        Slice(slice_name="Region Filter",
              viz_type='filter_box',
              datasource_type='table',
              datasource_id=tbl.id,
              params=get_slice_json(defaults,
                                    viz_type='filter_box',
                                    groupby=['region', 'country_name'])),
        Slice(slice_name="World's Population",
              viz_type='big_number',
              datasource_type='table',
              datasource_id=tbl.id,
              params=get_slice_json(defaults,
                                    since='2000',
                                    viz_type='big_number',
                                    compare_lag="10",
                                    metric='sum__SP_POP_TOTL',
                                    compare_suffix="over 10Y")),
        Slice(slice_name="Most Populated Countries",
              viz_type='table',
              datasource_type='table',
              datasource_id=tbl.id,
              params=get_slice_json(defaults,
                                    viz_type='table',
                                    metrics=["sum__SP_POP_TOTL"],
                                    groupby=['country_name'])),
        Slice(slice_name="Growth Rate",
              viz_type='line',
              datasource_type='table',
              datasource_id=tbl.id,
              params=get_slice_json(defaults,
                                    viz_type='line',
                                    since="1960-01-01",
                                    metrics=["sum__SP_POP_TOTL"],
                                    num_period_compare="10",
                                    groupby=['country_name'])),
        Slice(slice_name="% Rural",
              viz_type='world_map',
              datasource_type='table',
              datasource_id=tbl.id,
              params=get_slice_json(defaults,
                                    viz_type='world_map',
                                    metric="sum__SP_RUR_TOTL_ZS",
                                    num_period_compare="10")),
        Slice(slice_name="Life Expectancy VS Rural %",
              viz_type='bubble',
              datasource_type='table',
              datasource_id=tbl.id,
              params=get_slice_json(
                  defaults,
                  viz_type='bubble',
                  since="2011-01-01",
                  until="2011-01-02",
                  series="region",
                  limit="0",
                  entity="country_name",
                  x="sum__SP_RUR_TOTL_ZS",
                  y="sum__SP_DYN_LE00_IN",
                  size="sum__SP_POP_TOTL",
                  max_bubble_size="50",
                  flt_col_1="country_code",
                  flt_op_1="not in",
                  flt_eq_1=
                  "TCA,MNP,DMA,MHL,MCO,SXM,CYM,TUV,IMY,KNA,ASM,ADO,AMA,PLW",
                  num_period_compare="10",
              )),
        Slice(slice_name="Rural Breakdown",
              viz_type='sunburst',
              datasource_type='table',
              datasource_id=tbl.id,
              params=get_slice_json(
                  defaults,
                  viz_type='sunburst',
                  groupby=["region", "country_name"],
                  secondary_metric="sum__SP_RUR_TOTL",
                  since="2011-01-01",
                  until="2011-01-01",
              )),
        Slice(slice_name="World's Pop Growth",
              viz_type='area',
              datasource_type='table',
              datasource_id=tbl.id,
              params=get_slice_json(
                  defaults,
                  since="1960-01-01",
                  until="now",
                  viz_type='area',
                  groupby=["region"],
              )),
        Slice(slice_name="Box plot",
              viz_type='box_plot',
              datasource_type='table',
              datasource_id=tbl.id,
              params=get_slice_json(
                  defaults,
                  since="1960-01-01",
                  until="now",
                  whisker_options="Min/max (no outliers)",
                  viz_type='box_plot',
                  groupby=["region"],
              )),
        Slice(slice_name="Treemap",
              viz_type='treemap',
              datasource_type='table',
              datasource_id=tbl.id,
              params=get_slice_json(
                  defaults,
                  since="1960-01-01",
                  until="now",
                  viz_type='treemap',
                  metrics=["sum__SP_POP_TOTL"],
                  groupby=["region", "country_code"],
              )),
        Slice(slice_name="Parallel Coordinates",
              viz_type='para',
              datasource_type='table',
              datasource_id=tbl.id,
              params=get_slice_json(
                  defaults,
                  since="2011-01-01",
                  until="2011-01-01",
                  viz_type='para',
                  limit=100,
                  metrics=[
                      "sum__SP_POP_TOTL", 'sum__SP_RUR_TOTL_ZS',
                      'sum__SH_DYN_AIDS'
                  ],
                  secondary_metric='sum__SP_POP_TOTL',
                  series="country_name",
              )),
    ]
    misc_dash_slices.append(slices[-1].slice_name)
    for slc in slices:
        merge_slice(slc)

    print("Creating a World's Health Bank dashboard")
    dash_name = "World's Bank Data"
    slug = "world_health"
    dash = db.session.query(Dash).filter_by(slug=slug).first()

    if not dash:
        dash = Dash()
    js = textwrap.dedent("""\
    [
        {
            "col": 1,
            "row": 0,
            "size_x": 2,
            "size_y": 2,
            "slice_id": "1231"
        },
        {
            "col": 1,
            "row": 2,
            "size_x": 2,
            "size_y": 2,
            "slice_id": "1232"
        },
        {
            "col": 10,
            "row": 0,
            "size_x": 3,
            "size_y": 7,
            "slice_id": "1233"
        },
        {
            "col": 1,
            "row": 4,
            "size_x": 6,
            "size_y": 3,
            "slice_id": "1234"
        },
        {
            "col": 3,
            "row": 0,
            "size_x": 7,
            "size_y": 4,
            "slice_id": "1235"
        },
        {
            "col": 5,
            "row": 7,
            "size_x": 8,
            "size_y": 4,
            "slice_id": "1236"
        },
        {
            "col": 7,
            "row": 4,
            "size_x": 3,
            "size_y": 3,
            "slice_id": "1237"
        },
        {
            "col": 1,
            "row": 7,
            "size_x": 4,
            "size_y": 4,
            "slice_id": "1238"
        },
        {
            "col": 9,
            "row": 11,
            "size_x": 4,
            "size_y": 4,
            "slice_id": "1239"
        },
        {
            "col": 1,
            "row": 11,
            "size_x": 8,
            "size_y": 4,
            "slice_id": "1240"
        }
    ]
    """)
    l = json.loads(js)
    for i, pos in enumerate(l):
        pos['slice_id'] = str(slices[i].id)

    dash.dashboard_title = dash_name
    dash.position_json = json.dumps(l, indent=4)
    dash.slug = slug

    dash.slices = slices[:-1]
    db.session.merge(dash)
    db.session.commit()
Example #57
0
import sys
import pickle

import pandas as pd
import imageio
from amftrack.pipeline.functions.image_processing.experiment_class_surf import (
    Experiment,
    save_graphs,
    load_graphs,
)
from amftrack.transfer.functions.transfer import upload, zip_file
from amftrack.pipeline.functions.post_processing.extract_study_zone import (
    load_study_zone,
)

directory = str(sys.argv[1])
overwrite = eval(sys.argv[2])
i = int(sys.argv[-1])
op_id = int(sys.argv[-2])
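# temp_path is assumed to be provided elsewhere in the amftrack package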
run_info = pd.read_json(f"{temp_path}/{op_id}.json")
list_f, list_args = pickle.load(open(f"{temp_path}/{op_id}.pick", "rb"))
folder_list = list(run_info["folder_analysis"])
directory_name = folder_list[i]
select = run_info.loc[run_info["folder_analysis"] == directory_name]
row = [row for index, row in select.iterrows()][0]
plate_num = row["Plate"]
path_exp = f'{directory}{row["path_exp"]}'
exp = pickle.load(open(path_exp, "rb"))
try:
    exp.labeled
except AttributeError:
    exp.labeled = True

load_study_zone(exp)
Example #58
0
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from collections import defaultdict

client_credentials_manager = SpotifyClientCredentials(
    client_id="CLIENT_ID", client_secret="CLIENT_SECRET")

sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

df = pd.read_json('raw/data.json')


def get_audio_features(title, artist):
    print("Searching Spotify for", title, "by", artist)

    songs = sp.search(q='track:' + title + ' artist:' + artist + '*',
                      type='track')

    items = songs['tracks']['items']

    if len(items) > 0:
        print("Getting audio features")

        song_id = str(items[0]['id'])
        features = sp.audio_features(song_id)[0]

        if len(features) >= 18:
            return features

    print("None found")
Example #59
0
def gera_box():
    dados = pd.read_json(diretorio + jsonBox)
    print('\n Boxplot of the tweets:')
    dados.boxplot(column=['Polaridade', 'Subjetividade'], vert=False)
    mpl.show()
    street_hash = feature_hash.get_feature_set(street_list)
    rent_frame.drop(text_columns,axis=1,inplace=True)
    numerical_features = rent_frame.values
    return numpy.hstack((numerical_features,description_hash,features_list_hash,address_hash,street_hash))


def print_evaluation_metrics(trained_model, trained_model_name, X_test, y_test):
    print('--------- For Model : ', trained_model_name, '--------------------')
    predicted_values = trained_model.predict(X_test)
    print(metrics.classification_report(y_test, predicted_values))
    print("Accuracy Score : ", metrics.accuracy_score(y_test, predicted_values))
    print("---------------------------------------\n")


filename = 'train.json'
rent_frame = pd.read_json(filename)
print(rent_frame.columns)
print(len(rent_frame))
print(rent_frame.head(3))
target_variable = 'interest_level'
columns_to_delete = ['building_id','created','listing_id','manager_id','photos','interest_level']
class_labels = list(rent_frame[target_variable].values)
text_columns = ['description','features','display_address','street_address']
rent_frame.drop(columns_to_delete,axis=1,inplace=True)
full_features = get_label_encoded_features(rent_frame,text_columns)
X_train,X_test,y_train,y_test = train_test_split(full_features,class_labels,test_size=0.2,random_state=42)
classifier_list,classifier_name_list = get_ensemble_models()
for classifier,classifier_name in zip(classifier_list,classifier_name_list):
    classifier.fit(X_train,y_train)
    print_evaluation_metrics(classifier,classifier_name,X_test,y_test)