def _from_json(json):
    # 'json' is the JSON string/path argument (restored; it was missing from the
    # signature), not the json module.
    try:
        return pd.read_json(json)
    except ValueError:
        return pd.read_json(json, typ="series")
    except:
        raise
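# A minimal usage sketch (assumed, not from the original source): pd.read_json
# returns a DataFrame for mappings of column -> array, and raises ValueError for
# a mapping of all scalar values, which the fallback above reads as a Series.
frame = _from_json('{"a": [1, 2], "b": [3, 4]}')   # -> DataFrame with columns a, b
series = _from_json('{"a": 1, "b": 2}')            # -> Series indexed by a, b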
def legacy_kdata_to_csv(): for index, security_item in get_security_list().iterrows(): for fuquan in (True, False): dir = get_kdata_dir_old(security_item, fuquan) if os.path.exists(dir): files = [os.path.join(dir, f) for f in os.listdir(dir) if ('all' not in f and 'json' in f and os.path.isfile(os.path.join(dir, f)))] for f in files: tmp = os.path.basename(f).split('_') if fuquan: csv_path = get_kdata_path(security_item, tmp[0], tmp[1], 'hfq') if not os.path.exists(csv_path): df = pd.read_json(f, dtype={'code': str}) logger.info("{} to {}".format(f, csv_path)) df = df.loc[:, ['timestamp', 'code', 'low', 'open', 'close', 'high', 'volume', 'turnover', 'securityId', 'fuquan']] df.columns = KDATA_COLUMN_FQ df.to_csv(csv_path, index=False) else: csv_path = get_kdata_path(security_item, tmp[0], tmp[1], 'bfq') if not os.path.exists(csv_path): df = pd.read_json(f, dtype={'code': str}) logger.info("{} to {}".format(f, csv_path)) df = df.loc[:, KDATA_COLUMN] df.to_csv(csv_path, index=False)
def main(): emotions = pd.read_json('data/emotions.json') landmarks = pd.read_json('data/landmarks.json') facs = pd.read_json('data/facs.json') header = ['min', 'max', 'mean', 'std', 'count'] print('===Emotions===') emotions_dict = emotions[['emotion']].describe().to_dict() emotions_dict['emotion']['count (incl. null values)'] = len(emotions) print(wiki_tables(emotions_dict, header+['count (incl. null values)'])) print('===Landmarks===') print(wiki_tables(landmarks[['x', 'y']].describe().to_dict(), header)) print('===FACS===') print("====Summary====") print(wiki_tables(facs[['au', 'intensity']].describe().to_dict(), header)) print("====Details for each AU====") grouped_facs = facs.groupby(['au'])['intensity'].describe().to_dict() facs_dict = {} for au, field in grouped_facs: if not au in facs_dict: facs_dict[au] = {} facs_dict[au][field] = grouped_facs[(au, field)] print(wiki_tables(facs_dict, header, title='AU'))
def read_files(): # I hard-coded file names article_art = pd.read_json(open("./article.art")) article_rev = pd.read_json(open("./article.rev")) article_rev["is_talk"] = 0 talk_art = pd.read_json(open("./talk.art")) talk_rev = pd.read_json(open("./talk.rev")) talk_rev["is_talk"] = 1 revs = pd.concat([talk_rev, article_rev]) arts = article_art.set_index("title").join(talk_art.set_index("title"), rsuffix="_talk").reset_index() if "anon" not in revs.columns: revs["anon"] = np.nan if "minor" not in revs.columns: revs["minor"] = np.nan if "suppressed" not in revs.columns: revs["suppressed"] = np.nan if "userhidden" not in revs.columns: revs["userhidden"] = np.nan revs.loc[~revs.anon.isnull(), "anon"] = 1 revs.loc[~revs.minor.isnull(), "minor"] = 1 revs.loc[~revs.suppressed.isnull(), "suppressed"] = 1 revs.loc[~revs.userhidden.isnull(), "userhidden"] = 1 revs.loc[revs.anon.isnull(), "anon"] = 0 revs.loc[revs.minor.isnull(), "minor"] = 0 revs.loc[revs.suppressed.isnull(), "suppressed"] = 0 revs.loc[revs.userhidden.isnull(), "userhidden"] = 0 return [arts, revs]
def jiejin_quarter(self): #利用解禁数据的时间序列合成解禁数据 framewhatday1 = pd.DataFrame(columns=self.data_frame.columns) framewhatday2 = pd.DataFrame(columns=self.data_frame.columns) framewhatday3 = pd.DataFrame(columns=self.data_frame.columns) netprofit = pd.read_json('dataframe/' + NETPROFIT_NETPROFIT + '.json') netprofit = netprofit.sort_index(axis=1) holder_top10pct = pd.read_json('dataframe/' + ES_HOLDERS_PCT_HOLDER_TOP10PCT + '.json') holder_top10pct = holder_top10pct.sort_index(axis=1) holder_pctbyinst = pd.read_json('dataframe/' + ES_HOLDERS_PCT_HOLDER_PCTBYINST + '.json') holder_pctbyinst = holder_pctbyinst.sort_index(axis=1) for date in self.datelist: quarterday = what_quarter(str(date).split()[0]) if quarterday[1] in netprofit.index: framewhatday1.loc[date] = (netprofit.loc[quarterday[0]] / netprofit.loc[quarterday[1]] - 1) else: framewhatday1.loc[date] = None if quarterday[2] in holder_top10pct.index: framewhatday2.loc[date] = holder_top10pct.loc[quarterday[2]] / 100 else: framewhatday2.loc[date] = 0 if quarterday[2] in holder_pctbyinst.index: framewhatday3.loc[date] = holder_pctbyinst.loc[quarterday[2]] / 100 else: framewhatday3.loc[date] = 0 framewhatday1 = framewhatday1.fillna(0) #净利润为空的直接设为0 return {JIEJIN_DATE:self.data_frame,NETPROFIT_NETPROFIT:framewhatday1,ES_HOLDERS_PCT_HOLDER_TOP10PCT:framewhatday2,ES_HOLDERS_PCT_HOLDER_PCTBYINST:framewhatday3}
def test_lines_with_compression(compression):
    with tm.ensure_clean() as path:
        df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
        df.to_json(path, orient='records', lines=True, compression=compression)
        roundtripped_df = pd.read_json(path, lines=True, compression=compression)
        assert_frame_equal(df, roundtripped_df)
def setUp(self):
    answers = gpd.read_file(DIRPATH + '/answers.geojson')
    tests = pd.read_json(DIRPATH + '/tests.json')
    hard_tests = pd.read_json(DIRPATH + '/degenerate.json')
    self.all = answers.merge(tests, on='names').merge(hard_tests, on='names')
    self.conn = Connection('DECENNIALSF12010')
    self.conn.set_mapservice('tigerWMS_Census2010')
def get(self, source='train'): ''' Take data from source file and create object instanse with data from file. Parameters ---------- source : string can be 'train' or 'test' and show what file take for pretaring. Results ------- train_data : Pandas DataFrame, [m rows x n columns] example for train mode: ----------------------- index cuisine id ingredients 0 greek 10259 [romaine lettuce, black... 1 southern_us 25693 [plain flour, ground pe... 2 filipino 20130 [eggs, pepper, salt, ma... test_data : Pandas DataFrame, [m rows x n columns] example for test mode: ---------------------- index id ingredients 0 10259 [romaine lettuce, black olives, grape to... 1 25693 [plain flour, ground pepper, salt, tomat... 2 20130 [eggs, pepper, salt, mayonaise, cooking ... ''' data_file_name = self.folder + source + '.json' if source == 'train': self.train_data = pd.read_json(data_file_name) # self._vocabulary() self.test_data = pd.read_json(data_file_name)
def main():
    d = json.loads(sys.stdin.read())
    print('*** EM iterations ***')
    for i in range(4):
        # s = arbplf_newton_point(json.dumps(d))
        s = arbplf_coeff_expect(json.dumps(d))
        df = pd.read_json(StringIO(s), orient='split', precise_float=True)
        r = list(df.value)
        d['model_and_data']['edge_rate_coefficients'] = r
        print('EM summary:')
        print(r)
        summarize(d)
        print()
    print('*** newton iterations ***')
    for i in range(6):
        s = arbplf_newton_point(json.dumps(d))
        df = pd.read_json(StringIO(s), orient='split', precise_float=True)
        r = list(df.value)
        d['model_and_data']['edge_rate_coefficients'] = r
        print('newton summary:')
        print(r)
        summarize(d)
        print()
def test_frame_non_unique_columns(self): df = DataFrame([["a", "b"], ["c", "d"]], index=[1, 2], columns=["x", "x"]) self.assertRaises(ValueError, df.to_json, orient="index") self.assertRaises(ValueError, df.to_json, orient="columns") self.assertRaises(ValueError, df.to_json, orient="records") assert_frame_equal(df, read_json(df.to_json(orient="split"), orient="split", dtype=False)) unser = read_json(df.to_json(orient="values"), orient="values") np.testing.assert_equal(df.values, unser.values) # GH4377; duplicate columns not processing correctly df = DataFrame([["a", "b"], ["c", "d"]], index=[1, 2], columns=["x", "y"]) result = read_json(df.to_json(orient="split"), orient="split") assert_frame_equal(result, df) def _check(df): result = read_json(df.to_json(orient="split"), orient="split", convert_dates=["x"]) assert_frame_equal(result, df) for o in [ [["a", "b"], ["c", "d"]], [[1.5, 2.5], [3.5, 4.5]], [[1, 2.5], [3, 4.5]], [[Timestamp("20130101"), 3.5], [Timestamp("20130102"), 4.5]], ]: _check(DataFrame(o, index=[1, 2], columns=["x", "x"]))
def read_salmon_qc(sample_path, flen_lim=(100, 100), version='0.7.2'): ''' Parse technical quality control data from a Salmon quantification result. Parameters ---------- flen_lim, tuple (int start, int end), default (100, 100) How many bases to remove from start and end of fragment length distribution when calculating the robust mode. This is too see if things roughly worked out even if the max FLD Salmon parameter was set too small. version, str, default '0.6.0' The version of Salmon which generated the directory. Currently supports '0.6.0' and '0.4.0'. (Other versions might be compatible with these.) Returns ------- A pandas.Series with technical information from the Salmon results for the sample. ''' try: flen_dist = np.fromfile(sample_path + '/libParams/flenDist.txt', sep='\t') global_fl_mode = flen_dist.argmax() robust_fl_mode = flen_dist[flen_lim[0]:-flen_lim[1]].argmax() + flen_lim[0] except FileNotFoundError: global_fl_mode = 0 robust_fl_mode = 0 if version == '0.7.2': qc_data = pd.read_json(sample_path + '/aux_info/meta_info.json', typ='series') qc_data = qc_data[['num_processed', 'num_mapped', 'percent_mapped']] qc_data['global_fl_mode'] = global_fl_mode qc_data['robust_fl_mode'] = robust_fl_mode if version == '0.6.0': qc_data = pd.read_json(sample_path + '/aux/meta_info.json', typ='series') qc_data = qc_data[['num_processed', 'num_mapped', 'percent_mapped']] qc_data['global_fl_mode'] = global_fl_mode qc_data['robust_fl_mode'] = robust_fl_mode if version == '0.4.0': qc_data = pd.Series() log_file = sample_path + '/logs/salmon_quant.log' with open(log_file) as fh: for l in fh: if 'Observed ' in l: frags = int(l.split('Observed ')[-1].split(' total')[0]) qc_data['num_processed'] = frags if 'mapping rate' in l: rate = float(l.split(' = ')[1].split('%')[0]) qc_data['percent_mapped'] = rate qc_data['global_fl_mode'] = global_fl_mode qc_data['robust_fl_mode'] = robust_fl_mode return qc_data
def test_v12_compat(self):
    df = DataFrame(
        [
            [1.56808523, 0.65727391, 1.81021139, -0.17251653],
            [-0.2550111, -0.08072427, -0.03202878, -0.17581665],
            [1.51493992, 0.11805825, 1.629455, -1.31506612],
            [-0.02765498, 0.44679743, 0.33192641, -0.27885413],
            [0.05951614, -2.69652057, 1.28163262, 0.34703478],
        ],
        columns=["A", "B", "C", "D"],
        index=pd.date_range("2000-01-03", "2000-01-07"),
    )
    df["date"] = pd.Timestamp("19920106 18:21:32.12")
    df.ix[3, "date"] = pd.Timestamp("20130101")
    df["modified"] = df["date"]
    df.ix[1, "modified"] = pd.NaT

    v12_json = os.path.join(self.dirpath, "tsframe_v012.json")
    df_unser = pd.read_json(v12_json)
    assert_frame_equal(df, df_unser)

    df_iso = df.drop(["modified"], axis=1)
    v12_iso_json = os.path.join(self.dirpath, "tsframe_iso_v012.json")
    df_unser_iso = pd.read_json(v12_iso_json)
    assert_frame_equal(df_iso, df_unser_iso)
def test_frame_mixedtype_orient(self): # GH10289 vals = [ [10, 1, "foo", 0.1, 0.01], [20, 2, "bar", 0.2, 0.02], [30, 3, "baz", 0.3, 0.03], [40, 4, "qux", 0.4, 0.04], ] df = DataFrame(vals, index=list("abcd"), columns=["1st", "2nd", "3rd", "4th", "5th"]) self.assertTrue(df._is_mixed_type) right = df.copy() for orient in ["split", "index", "columns"]: inp = df.to_json(orient=orient) left = read_json(inp, orient=orient, convert_axes=False) assert_frame_equal(left, right) right.index = np.arange(len(df)) inp = df.to_json(orient="records") left = read_json(inp, orient="records", convert_axes=False) assert_frame_equal(left, right) right.columns = np.arange(df.shape[1]) inp = df.to_json(orient="values") left = read_json(inp, orient="values", convert_axes=False) assert_frame_equal(left, right)
def test_timedelta(self): converter = lambda x: pd.to_timedelta(x, unit="ms") s = Series([timedelta(23), timedelta(seconds=5)]) self.assertEqual(s.dtype, "timedelta64[ns]") # index will be float dtype assert_series_equal(s, pd.read_json(s.to_json(), typ="series").apply(converter), check_index_type=False) s = Series([timedelta(23), timedelta(seconds=5)], index=pd.Index([0, 1], dtype=float)) self.assertEqual(s.dtype, "timedelta64[ns]") assert_series_equal(s, pd.read_json(s.to_json(), typ="series").apply(converter)) frame = DataFrame([timedelta(23), timedelta(seconds=5)]) self.assertEqual(frame[0].dtype, "timedelta64[ns]") assert_frame_equal( frame, pd.read_json(frame.to_json()).apply(converter), check_index_type=False, check_column_type=False ) frame = DataFrame( { "a": [timedelta(days=23), timedelta(seconds=5)], "b": [1, 2], "c": pd.date_range(start="20130101", periods=2), } ) result = pd.read_json(frame.to_json(date_unit="ns")) result["a"] = pd.to_timedelta(result.a, unit="ns") result["c"] = pd.to_datetime(result.c) assert_frame_equal(frame, result, check_index_type=False)
def test_frame_non_unique_columns(self): df = DataFrame([['a', 'b'], ['c', 'd']], index=[1, 2], columns=['x', 'x']) self.assertRaises(ValueError, df.to_json, orient='index') self.assertRaises(ValueError, df.to_json, orient='columns') self.assertRaises(ValueError, df.to_json, orient='records') assert_frame_equal(df, read_json(df.to_json(orient='split'), orient='split', dtype=False)) unser = read_json(df.to_json(orient='values'), orient='values') tm.assert_numpy_array_equal(df.values, unser.values) # GH4377; duplicate columns not processing correctly df = DataFrame([['a', 'b'], ['c', 'd']], index=[ 1, 2], columns=['x', 'y']) result = read_json(df.to_json(orient='split'), orient='split') assert_frame_equal(result, df) def _check(df): result = read_json(df.to_json(orient='split'), orient='split', convert_dates=['x']) assert_frame_equal(result, df) for o in [[['a', 'b'], ['c', 'd']], [[1.5, 2.5], [3.5, 4.5]], [[1, 2.5], [3, 4.5]], [[Timestamp('20130101'), 3.5], [Timestamp('20130102'), 4.5]]]: _check(DataFrame(o, index=[1, 2], columns=['x', 'x']))
def test_frame_mixedtype_orient(self): # GH10289 vals = [[10, 1, 'foo', .1, .01], [20, 2, 'bar', .2, .02], [30, 3, 'baz', .3, .03], [40, 4, 'qux', .4, .04]] df = DataFrame(vals, index=list('abcd'), columns=['1st', '2nd', '3rd', '4th', '5th']) self.assertTrue(df._is_mixed_type) right = df.copy() for orient in ['split', 'index', 'columns']: inp = df.to_json(orient=orient) left = read_json(inp, orient=orient, convert_axes=False) assert_frame_equal(left, right) right.index = np.arange(len(df)) inp = df.to_json(orient='records') left = read_json(inp, orient='records', convert_axes=False) assert_frame_equal(left, right) right.columns = np.arange(df.shape[1]) inp = df.to_json(orient='values') left = read_json(inp, orient='values', convert_axes=False) assert_frame_equal(left, right)
def searchByName(self, searchByName=""): searchParameters = 'searchByName=%s' % searchByName print searchParameters if cherrypy.request.method != 'GET': cherrypy.response.headers['Allow'] = 'GET' raise cherrypy.HTTPError(405) if not searchByName: return "Please enter a valid name" if self.searchManager(searchParameters): cand_ids, cand_comms = self.searches[searchParameters] if not cand_ids or not cand_comms: return self.noResultsFound() cand_ids = pandas.read_json(json.dumps(cand_ids)).to_html() cand_comms = pandas.read_json(json.dumps(cand_comms), orient='index').to_html() return cand_ids+cand_comms else: s = self.__SearchLocation(self.__Connection) parameters = {'name':searchByName} cand_ids, cand_comms = s.search_by_name(parameters) if not cand_ids or not cand_comms: return self.noResultsFound() self.searches[searchParameters] = (cand_ids, cand_comms) cand_ids = pandas.read_json(json.dumps(cand_ids)).to_html() cand_comms = pandas.read_json(json.dumps(cand_comms), orient='index').to_html() cherrypy.response.headers['Content-Type'] = 'text/html' return cand_ids+cand_comms
def get_all_vocabulary(self): all_codes, code_types = self.get_meth_codes() ## do it this way if you want a non-nested list of all codes ## i.e. er_codes = [code1, code2,...] ##def get_one_meth_category(category, all_codes, code_types): ## do it this way if you want a tiered list of all codes ## i.e. er_codes = {'anisotropy_codes': ['code1', 'code2'], ...} ##def get_tiered_meth_category(mtype, all_codes, code_types): if any(all_codes): methods = self.get_tiered_meth_category('other', all_codes, code_types) age_methods = self.get_tiered_meth_category('age', all_codes, code_types) else: methods = self.get_tiered_meth_category_offline() age_methods = self.get_tiered_meth_category_offline() path = os.path.join(data_model_dir, 'code_types.txt') with open(path, 'r') as type_file: raw_code_types = json.load(type_file) code_types = pd.read_json(raw_code_types) path = os.path.join(data_model_dir, 'all_codes.txt') with open(path, 'r') as code_file: raw_all_codes = json.load(code_file) all_codes = pd.read_json(raw_all_codes) vocabularies = self.get_controlled_vocabularies() suggested = self.get_suggested_vocabularies() self.vocabularies = vocabularies self.suggested = suggested #self.possible_vocabularies = possible_vocabularies self.all_codes = all_codes self.code_types = code_types self.methods = methods self.age_methods = age_methods
def GET(self): web.header('Access-Control-Allow-Origin', '*') output = dict() getInput = web.input(start='2012-3-03 16:00:00', end='2012-3-03 21:00:00') start_time=pd.to_datetime(getInput.start).tz_localize('US/Eastern') - pd.DateOffset(hours=10) end_time=pd.to_datetime(getInput.end).tz_localize('US/Eastern') - pd.DateOffset(hours=10) output_nodes = set() all_schedules = pd.read_json('all_schedules.json') allnodes = pd.read_json('allnodes.json') nodes = set(allnodes.nodes) all_schedules['end'] = all_schedules['end'].map(lambda x: datetime.datetime.fromtimestamp(x/1000000000)) all_schedules['start'] = all_schedules['start'].map(lambda x: datetime.datetime.fromtimestamp(x/1000000000)) night_sched = all_schedules[(all_schedules.start >= start_time) & (all_schedules.end <= end_time)] on_nodes = set() for idx,show in night_sched.iterrows(): on_nodes.add(show[2]) off_nodes = nodes.difference(on_nodes) imported_graph = nx.read_gexf('./finished_network3.gexf') for i in off_nodes: try: imported_graph.remove_node(i) except: continue pr=nx.pagerank(imported_graph,alpha=0.9,weight='newweight',tol=.01, max_iter=200) output['nodes'] = [(i,v*1000000) for i,v in pr.items()] output['input_params'] = getInput return json.dumps(output)
def get_plot_data():
    df = pd.read_json(URL_TOTALES_PRESIDENTE)
    totales = df[df.provincia == 99]
    # keep the sorted result: sort() returns a new frame, so discarding it would
    # leave the top-two extraction below in arbitrary order
    totales = totales.sort('porc_final_agrupacion', ascending=False)
    codigos = totales.codigo_agrupacion.values
    porcentajes = totales.porc_final_agrupacion.astype(float).values
    porcentajes *= 0.01
    porcentajes = list(porcentajes)
    primero, segundo = porcentajes[:2]
    # if primero >= 45:
    #     falta = 0
    falta = max(40, (segundo + 10)) - primero
    listas = pd.read_json(URL_LISTAS)

    def get_candidatos_lista(c):
        return listas[listas.codigo == c].siglas.values[0]

    fuerzas = [get_candidatos_lista(c) for c in codigos]
    candidatos = [CANDIDATOS[f] for f in fuerzas]
    data = (candidatos, (porcentajes, falta))
    json_dump_unicode(data, "databokeh.json")
    return data
def main():
    with open('data.txt', 'r') as f:
        # print f.read()
        # di = json.loads(f.read())
        data_2007 = pd.read_json(f.read())
    cols = ['tmp', 'thunder', 'press', 'hum', 'date', 'rain']
    train_y = data_2007['sales']
    train_x = data_2007[cols]
    print train_x.shape
    clf_lr = RandomForestClassifier(n_estimators=50, n_jobs=-1)
    clf_lr.fit(train_x, train_y)
    with open('data2.txt', 'r') as f2:
        data_2008 = pd.read_json(f2.read())
    pr = clf_lr.predict(data_2008[['tmp', 'thunder', 'press', 'hum', 'date', 'rain']])
    print pr
    with open('data2.txt', 'r') as f:
        elems = json.loads(f.read())
    m = 0
    for i in pr:
        elems[m]["sales"] = i
        m += 1
    l = [[int(time.mktime(datetime.datetime.strptime(i["date"], "%Y%m%d").timetuple())) * 1000, i["sales"]]
         for i in elems]
    # y = [i["sales"] for i in elems]
    print l
def test_readjson_chunks_from_file():
    with ensure_clean('test.json') as path:
        df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
        df.to_json(path, lines=True, orient="records")
        chunked = pd.concat(pd.read_json(path, lines=True, chunksize=1))
        unchunked = pd.read_json(path, lines=True)
        assert_frame_equal(unchunked, chunked)
def gatherStats3(fileLoc, force=False):
    dirName = fileLoc.split("/")[-2]
    outFilename = PROCESSED_FILE_LOC + "stats_" + dirName + ".json"
    if force or not isfile(outFilename):
        startYear = int(dirName.split("_")[1])
        endYear = int(dirName.split("_")[2])
        fileList = listdir(fileLoc)
        print "fileListLen:", len(fileList)
        stats = pd.DataFrame(index=range(startYear, endYear + 1))
        for f in fileList:
            path = join(fileLoc, f)
            if isfile(path) and getsize(path) > 0 and f != 'dummy.json':
                quotes = pd.read_json(path)
                quotes['Year'] = quotes['Date'].apply(lambda x: x.year)
                grp = quotes[['Year', 'Date']].groupby(['Year']).count()
                symbol = f.split('.')[0]
                s = pd.Series(0, index=stats.index)
                for year in grp.index:
                    s[year] = grp.loc[year]['Date']
                stats[symbol] = s
        stats.to_json(outFilename)
    else:
        stats = pd.read_json(outFilename)
    return stats
def test_timedelta(self): converter = lambda x: pd.to_timedelta(x, unit='ms') s = Series([timedelta(23), timedelta(seconds=5)]) self.assertEqual(s.dtype, 'timedelta64[ns]') result = pd.read_json(s.to_json(), typ='series').apply(converter) assert_series_equal(result, s) s = Series([timedelta(23), timedelta(seconds=5)], index=pd.Index([0, 1])) self.assertEqual(s.dtype, 'timedelta64[ns]') result = pd.read_json(s.to_json(), typ='series').apply(converter) assert_series_equal(result, s) frame = DataFrame([timedelta(23), timedelta(seconds=5)]) self.assertEqual(frame[0].dtype, 'timedelta64[ns]') assert_frame_equal(frame, pd.read_json(frame.to_json()) .apply(converter)) frame = DataFrame({'a': [timedelta(days=23), timedelta(seconds=5)], 'b': [1, 2], 'c': pd.date_range(start='20130101', periods=2)}) result = pd.read_json(frame.to_json(date_unit='ns')) result['a'] = pd.to_timedelta(result.a, unit='ns') result['c'] = pd.to_datetime(result.c) assert_frame_equal(frame, result)
def test_read_zipped_json():
    uncompressed_path = tm.get_data_path("tsframe_v012.json")
    uncompressed_df = pd.read_json(uncompressed_path)
    compressed_path = tm.get_data_path("tsframe_v012.json.zip")
    compressed_df = pd.read_json(compressed_path, compression='zip')
    assert_frame_equal(uncompressed_df, compressed_df)
def add_new_url_list(new):
    old = pn.read_json("urls_linkedin.json")
    new = pn.read_json(new)
    data = old.append(new)
    data.drop_duplicates(inplace=True, take_last=True)
    data.reset_index(drop=True, inplace=True)
    data.to_json("urls_linkedin.json")
    return "new urls added"
def test_series_non_unique_index(self):
    s = Series(["a", "b"], index=[1, 1])
    self.assertRaises(ValueError, s.to_json, orient="index")
    assert_series_equal(s, read_json(s.to_json(orient="split"),
                                     orient="split", typ="series"))
    unser = read_json(s.to_json(orient="records"),
                      orient="records", typ="series")
    np.testing.assert_equal(s.values, unser.values)
def test_read_zipped_json(datapath):
    uncompressed_path = datapath("io", "json", "data", "tsframe_v012.json")
    uncompressed_df = pd.read_json(uncompressed_path)
    compressed_path = datapath("io", "json", "data", "tsframe_v012.json.zip")
    compressed_df = pd.read_json(compressed_path, compression='zip')
    assert_frame_equal(uncompressed_df, compressed_df)
def test_chunksize_with_compression(compression):
    with tm.ensure_clean() as path:
        df = pd.read_json('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}')
        df.to_json(path, orient='records', lines=True, compression=compression)
        roundtripped_df = pd.concat(pd.read_json(path, lines=True, chunksize=1,
                                                 compression=compression))
        assert_frame_equal(df, roundtripped_df)
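# A minimal sketch of the same round trip outside the test harness (the file name
# 'records.json.gz' is assumed): write JSON Lines with gzip compression, then
# stream it back in chunks and reassemble the frame.
import pandas as pd

df = pd.DataFrame({'a': ['foo', 'bar', 'baz'], 'b': [4, 5, 6]})
df.to_json('records.json.gz', orient='records', lines=True, compression='gzip')
reader = pd.read_json('records.json.gz', lines=True, chunksize=2, compression='gzip')
restored = pd.concat(reader, ignore_index=True)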
def pred_hotstar():
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    df_train = pd.read_json("~/Documents/dataset/hotstar/train_data.json")
    df_test = pd.read_json("~/Documents/dataset/hotstar/test_data.json")
    df_train.head()
from sklearn.feature_extraction.text import TfidfVectorizer

if len(sys.argv) == 1:
    sys.exit("test or train arg is missing.")

test_train = sys.argv[1].strip().lower()
if test_train != 'test' and test_train != 'train':
    sys.exit("Invalid argument passed.")

dataset_dir = 'data/{}.json'.format(test_train)
output_dir = 'data/{}/'.format(test_train)
stop_words = set(stopwords.words('english'))
porter = PorterStemmer()
data = pd.read_json(dataset_dir, convert_dates=['created'])


def parse_text(text):
    soup = BeautifulSoup(text)  # fixed: was 'desc', which is undefined in this function
    return ' ' + soup.get_text()


def clean_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    words = []
    for token in tokens:
        if token.isalpha():
            words.append(porter.stem(token))
    cleaned_text = ' '.join(words)
def load_energy(): """Loads an energy related dataset to use with sankey and graphs""" tbl_name = 'energy_usage' with gzip.open(os.path.join(DATA_FOLDER, 'energy.json.gz')) as f: pdf = pd.read_json(f) pdf.to_sql(tbl_name, db.engine, if_exists='replace', chunksize=500, dtype={ 'source': String(255), 'target': String(255), 'value': Float(), }, index=False) print("Creating table [wb_health_population] reference") tbl = db.session.query(TBL).filter_by(table_name=tbl_name).first() if not tbl: tbl = TBL(table_name=tbl_name) tbl.description = "Energy consumption" tbl.is_featured = True tbl.database = get_or_create_main_db() db.session.merge(tbl) db.session.commit() tbl.fetch_metadata() slc = Slice(slice_name="Energy Sankey", viz_type='sankey', datasource_type='table', datasource_id=tbl.id, params=textwrap.dedent("""\ { "collapsed_fieldsets": "", "datasource_id": "3", "datasource_name": "energy_usage", "datasource_type": "table", "flt_col_0": "source", "flt_eq_0": "", "flt_op_0": "in", "groupby": [ "source", "target" ], "having": "", "metric": "sum__value", "row_limit": "5000", "slice_name": "Energy Sankey", "viz_type": "sankey", "where": "" } """)) misc_dash_slices.append(slc.slice_name) merge_slice(slc) slc = Slice(slice_name="Energy Force Layout", viz_type='directed_force', datasource_type='table', datasource_id=tbl.id, params=textwrap.dedent("""\ { "charge": "-500", "collapsed_fieldsets": "", "datasource_id": "1", "datasource_name": "energy_usage", "datasource_type": "table", "flt_col_0": "source", "flt_eq_0": "", "flt_op_0": "in", "groupby": [ "source", "target" ], "having": "", "link_length": "200", "metric": "sum__value", "row_limit": "5000", "slice_name": "Force", "viz_type": "directed_force", "where": "" } """)) misc_dash_slices.append(slc.slice_name) merge_slice(slc) slc = Slice(slice_name="Heatmap", viz_type='heatmap', datasource_type='table', datasource_id=tbl.id, params=textwrap.dedent("""\ { "all_columns_x": "source", "all_columns_y": "target", "canvas_image_rendering": "pixelated", "collapsed_fieldsets": "", "datasource_id": "1", "datasource_name": "energy_usage", "datasource_type": "table", "flt_col_0": "source", "flt_eq_0": "", "flt_op_0": "in", "having": "", "linear_color_scheme": "blue_white_yellow", "metric": "sum__value", "normalize_across": "heatmap", "slice_name": "Heatmap", "viz_type": "heatmap", "where": "", "xscale_interval": "1", "yscale_interval": "1" } """)) misc_dash_slices.append(slc.slice_name) merge_slice(slc)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime

df = pd.read_json('amsterdam_sold_geo_4pp.json')
df = df.reset_index(drop=True)  # reset_index returns a new frame; keep the result
df['price per unit area'] = df['price'] / df['area']
df['postdate'] = pd.to_datetime(df['posting_date'], dayfirst=True)
df['saledate'] = pd.to_datetime(df['sale_date'], dayfirst=True)
# print(df['saledate'] - df['postdate'])
df['time_to_sell'] = df['saledate'] - df['postdate']

dfp = df[df['price'] > 50000]
dfp['days_to_sell'] = dfp['time_to_sell'].apply(lambda x: x.days)
dfp = dfp[dfp['days_to_sell'] > 0]
dfp_week = dfp.groupby(pd.TimeGrouper(key='saledate', freq='W')).mean()

fig, (ax1, ax2) = plt.subplots(2, sharex=True)
ax1.scatter(dfp['saledate'].values, dfp['price'] / 1000.0, s=10)
ax1.plot_date(dfp_week.index, dfp_week['price'] / 1000.0, 'r-', linewidth=2)
ax1.set_xlim([datetime.datetime(2015, 4, 1), datetime.datetime(2016, 8, 1)])
ax1.set_xlabel('Date of sale')
ax1.set_ylabel(u'Asking price (1,000 EUR)')
ax1.set_title('Property sales in Amsterdam')
ax1.set_ylim([0, 800])
ax1.grid('on')
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 20 14:09:50 2015

@author: btrani
"""
import numpy as np
import pandas as pd
import datetime
import urllib

# Read in our data. We've aggregated it by date already, so we don't need to worry about paging
query = (
    'https://data.smgov.net/resource/xx64-wi4x.json?$$app_token=g5iIFV3PzVEgEGqkxekFlTlxW'
)
raw_data = pd.read_json(query)
def extractFtpData(self): print('\nExtracting FTP Glofas Data\n') files = [ f for f in listdir(self.inputPath) if isfile(join(self.inputPath, f)) and f.endswith('.nc') ] df_thresholds = pd.read_json(json.dumps(self.GLOFAS_STATIONS)) df_thresholds = df_thresholds.set_index("stationCode", drop=False) df_district_mapping = pd.read_json(json.dumps(self.DISTRICT_MAPPING)) df_district_mapping = df_district_mapping.set_index("glofasStation", drop=False) stations = [] trigger_per_day = { 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, } for i in range(0, len(files)): logging.info("Extracting glofas data from %s", i) Filename = os.path.join(self.inputPath, files[i]) station = {} station['code'] = files[i].split('_')[2] data = xr.open_dataset(Filename) # Get threshold for this specific station if station['code'] in df_thresholds['stationCode'] and station[ 'code'] in df_district_mapping['glofasStation']: print(Filename) threshold = df_thresholds[df_thresholds['stationCode'] == station['code']][TRIGGER_LEVEL][0] # Set dimension-values time = 0 for step in range(1, 8): # Loop through 51 ensembles, get forecast and compare to threshold ensemble_options = 51 count = 0 dis_sum = 0 for ensemble in range(0, ensemble_options): discharge = data['dis'].sel(ensemble=ensemble, step=step).values[time][0] # DUMMY OVERWRITE DEPENDING ON COUNTRY SETTING if SETTINGS_SECRET[self.country_code]['mock'] == True: if SETTINGS_SECRET[self.country_code][ 'if_mock_trigger'] == True: if step < 5: # Only dummy trigger for 5-day and above discharge = 0 elif station[ 'code'] == 'G1361': # ZMB dummy flood station 1 discharge = 8000 elif station[ 'code'] == 'G1328': # ZMB dummy flood station 2 discharge = 9000 else: discharge = 0 else: discharge = 0 if discharge >= threshold: count = count + 1 dis_sum = dis_sum + discharge prob = count / ensemble_options dis_avg = dis_sum / ensemble_options station['fc'] = dis_avg station['fc_prob'] = prob station['fc_trigger'] = 1 if prob > TRIGGER_LEVELS[ 'minimum'] else 0 if station['fc_trigger'] == 1: trigger_per_day[step] = 1 if step == self.leadTimeValue: stations.append(station) station = {} station['code'] = files[i].split('_')[2] data.close() # Add 'no_station' for station_code in ['no_station']: station = {} station['code'] = station_code station['fc'] = 0 station['fc_prob'] = 0 station['fc_trigger'] = 0 stations.append(station) with open(self.extractedGlofasPath, 'w') as fp: json.dump(stations, fp) print('Extracted Glofas data - File saved') with open(self.triggerPerDay, 'w') as fp: json.dump([trigger_per_day], fp) print('Extracted Glofas data - Trigger per day File saved')
import pydeck as pdk
import pandas as pd

GPU_GRID_LAYER_DATA = ("https://raw.githubusercontent.com/uber-common/"
                       "deck.gl-data/master/website/sf-bike-parking.json")
df = pd.read_json(GPU_GRID_LAYER_DATA)

# Define a layer to display on a map
layer = pdk.Layer(
    "GPUGridLayer",
    df,
    pickable=True,
    extruded=True,
    cellSize=200,
    elevation_scale=4,
    get_position="COORDINATES",
)

# Set the viewport location
view_state = pdk.ViewState(latitude=37.7749295, longitude=-122.4194155,
                           zoom=11, bearing=0, pitch=45)

# Render
r = pdk.Deck(
    layers=[layer],
    initial_view_state=view_state,
    tooltip={"text": "{position}\nCount: {count}"},
)
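# Assumed follow-up step (not part of the snippet above): a pydeck Deck can be
# exported as a standalone HTML page for viewing in a browser; the output file
# name here is illustrative.
r.to_html("gpu_grid_layer.html")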
def read_json( cls, path_or_buf=None, orient=None, typ="frame", dtype=True, convert_axes=True, convert_dates=True, keep_default_dates=True, numpy=False, precise_float=False, date_unit=None, encoding=None, lines=False, chunksize=None, compression="infer", ): kwargs = { "path_or_buf": path_or_buf, "orient": orient, "typ": typ, "dtype": dtype, "convert_axes": convert_axes, "convert_dates": convert_dates, "keep_default_dates": keep_default_dates, "numpy": numpy, "precise_float": precise_float, "date_unit": date_unit, "encoding": encoding, "lines": lines, "chunksize": chunksize, "compression": compression, } if cls.read_json_remote_task is None: return super(RayIO, cls).read_json(**kwargs) if not lines: ErrorMessage.default_to_pandas( "`read_json` only optimized with `lines=True`" ) return super(RayIO, cls).read_json(**kwargs) else: # TODO: Pick up the columns in an optimized way from all data # All rows must be read because some rows may have missing data # Currently assumes all rows have the same columns from io import BytesIO columns = pandas.read_json( BytesIO(b"" + open(path_or_buf, "rb").readline()), lines=True ).columns kwargs["columns"] = columns empty_pd_df = pandas.DataFrame(columns=columns) path_or_buf = kwargs.pop("path_or_buf") with file_open(path_or_buf, "rb", kwargs.get("compression", "infer")) as f: total_bytes = file_size(f) from modin.pandas import DEFAULT_NPARTITIONS num_partitions = DEFAULT_NPARTITIONS num_splits = min(len(columns), num_partitions) chunk_size = max(1, (total_bytes - f.tell()) // num_partitions) partition_ids = [] index_ids = [] dtypes_ids = [] column_chunksize = compute_chunksize(empty_pd_df, num_splits, axis=1) if column_chunksize > len(columns): column_widths = [len(columns)] num_splits = 1 else: column_widths = [ column_chunksize if i != num_splits - 1 else len(columns) - (column_chunksize * (num_splits - 1)) for i in range(num_splits) ] while f.tell() < total_bytes: start = f.tell() f.seek(chunk_size, os.SEEK_CUR) f.readline() partition_id = cls.read_json_remote_task._remote( args=(path_or_buf, num_splits, start, f.tell(), kwargs), num_return_vals=num_splits + 3, ) partition_ids.append(partition_id[:-3]) index_ids.append(partition_id[-3]) dtypes_ids.append(partition_id[-2]) row_lengths = ray.get(index_ids) new_index = pandas.RangeIndex(sum(row_lengths)) dtypes = ( pandas.concat(ray.get(dtypes_ids), axis=1) .apply(lambda row: find_common_type(row.values), axis=1) .squeeze(axis=0) ) partition_ids = [ [ cls.frame_partition_cls( partition_ids[i][j], length=row_lengths[i], width=column_widths[j], ) for j in range(len(partition_ids[i])) ] for i in range(len(partition_ids)) ] if isinstance(dtypes, pandas.Series): dtypes.index = columns else: dtypes = pandas.Series(dtypes, index=columns) new_frame = cls.frame_cls( np.array(partition_ids), new_index, columns, row_lengths, column_widths, dtypes=dtypes, ) new_frame._apply_index_objs(axis=0) return cls.query_compiler_cls(new_frame)
def update_font(p_dfs, lang): start_time = time.time() pages = len(p_dfs) new_dfs = [] try: for page_index in range(pages): page_df = p_dfs[page_index] page_df = page_df.where(page_df.notnull(), None) page_lis = [] child_lis = [] for index, row in page_df.iterrows(): if row['children'] == None: page_lis.append(change_font(row["font_family"], lang)) child_lis.append(row['children']) else: sub_block_children = pd.read_json(row['children']) sub_block_children = sub_block_children.where( sub_block_children.notnull(), None) page_lis1 = [] child_lis1 = [] for index2, row2 in sub_block_children.iterrows(): if row2['children'] == None: child_lis1.append(row2['children']) page_lis1.append( change_font(row2["font_family"], lang)) else: sub2_block_children = pd.read_json( row2['children']) sub2_block_children = sub2_block_children.where( sub2_block_children.notnull(), None) page_lis2 = [] for index3, row3 in sub2_block_children.iterrows(): page_lis2.append( change_font(row3["font_family"], lang)) sub2_block_children['font_family'] = page_lis2 #print(sub2_block_children) page_lis1.append( max(set(page_lis2), key=page_lis2.count)) child_lis1.append(sub2_block_children.to_json()) #print(child_lis1) sub_block_children['font_family'] = page_lis1 sub_block_children['children'] = child_lis1 page_lis.append(max(set(page_lis1), key=page_lis1.count)) child_lis.append(sub_block_children.to_json()) page_df['font_family'] = page_lis page_df['children'] = child_lis new_dfs.append(page_df) end_time = time.time() extraction_time = end_time - start_time log_info('Updating of fonts completed in {}'.format(extraction_time), app_context.application_context) except Exception as e: log_error('Error in updating fonts', app_context.application_context, e) return None return new_dfs
RMSErbarsum = 0
it = 0
conn = comm.openServerConn(args.backend_port, args.backend_ip)
batch_id = 0
while True:
    batch_id += 1
    print("Waiting for new batch...")
    try:
        comm.sendMessage(conn, "ACK")
    except BrokenPipeError:
        print("Connection with frontend terminated!")
        break
    msg = comm.getMessage(conn, isJSON=True)  # fetch data from frontend
    mbatch = pd.read_json(msg)

    """Batching phase"""
    start = time.time()
    for index, row in mbatch.iterrows():
        '''batch1: Get new observation (user, item, rating, timestamp)'''
        user = int(row['user'])
        item_id = int(row['item'])
        rating = float(row['rating'])
        timestamp = int(row['unixtimestamp'])

        '''batch2: Add it to fresh_ratings dictionary'''
        if user in tempTimestep:
            tempTimestep[user] += 1
        else:
            tempTimestep[user] = 1
        fresh_ratings[(user, item_id)] = (rating, tempTimestep[user])

        if rmse:
            '''batch3a: Predict'''
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 19 13:09:26 2019

Applied Statistical Methods Final Project
Data Processing/Formatting Code

@author: Ellie Frith
"""
import pandas as pd
import numpy as np
import re

pompeo_df = pd.read_json('sec_pompeo_tweets.json')
trump_df = pd.read_json('trump_tweets.json')

# Response from Twitter API has lots of extraneous information and dense
# dictionaries: want to extract the information useful for this analysis


def media_type(extended_entities):
    '''This function extracts the media type of any media attached to a tweet
    (image, video, etc). If tweet has no media, the function will return a
    value of nan.
    '''
    if pd.isnull(extended_entities) == True:
        media = np.nan
def load_multiformat_time_series_data(): """Loading time series data from a zip file in the repo""" with gzip.open(os.path.join(DATA_FOLDER, 'multiformat_time_series.json.gz')) as f: pdf = pd.read_json(f) pdf.ds = pd.to_datetime(pdf.ds, unit='s') pdf.ds2 = pd.to_datetime(pdf.ds2, unit='s') pdf.to_sql('multiformat_time_series', db.engine, if_exists='replace', chunksize=500, dtype={ "ds": Date, 'ds2': DateTime, "epoch_s": BigInteger, "epoch_ms": BigInteger, "string0": String(100), "string1": String(100), "string2": String(100), "string3": String(100), }, index=False) print("Done loading table!") print("-" * 80) print("Creating table [multiformat_time_series] reference") obj = db.session.query(TBL).filter_by( table_name='multiformat_time_series').first() if not obj: obj = TBL(table_name='multiformat_time_series') obj.main_dttm_col = 'ds' obj.database = get_or_create_main_db() obj.is_featured = False dttm_and_expr_dict = { 'ds': [None, None], 'ds2': [None, None], 'epoch_s': ['epoch_s', None], 'epoch_ms': ['epoch_ms', None], 'string2': ['%Y%m%d-%H%M%S', None], 'string1': ['%Y-%m-%d^%H:%M:%S', None], 'string0': ['%Y-%m-%d %H:%M:%S.%f', None], 'string3': ['%Y/%m/%d%H:%M:%S.%f', None], } for col in obj.columns: dttm_and_expr = dttm_and_expr_dict[col.column_name] col.python_date_format = dttm_and_expr[0] col.dbatabase_expr = dttm_and_expr[1] col.is_dttm = True db.session.merge(obj) db.session.commit() obj.fetch_metadata() tbl = obj print("Creating some slices") for i, col in enumerate(tbl.columns): slice_data = { "granularity_sqla": col.column_name, "datasource_id": "8", "datasource_name": "multiformat_time_series", "datasource_type": "table", "granularity": "day", "row_limit": config.get("ROW_LIMIT"), "since": "1 year ago", "until": "now", "where": "", "viz_type": "cal_heatmap", "domain_granularity": "month", "subdomain_granularity": "day", } slc = Slice( slice_name="Calendar Heatmap multiformat " + str(i), viz_type='cal_heatmap', datasource_type='table', datasource_id=tbl.id, params=get_slice_json(slice_data), ) merge_slice(slc) misc_dash_slices.append(slc.slice_name)
import os
import glob
import psycopg2
import pandas as pd
from sql_queries import *


def process_song_file(cur, filepath):
    """Reads songs log file row by row, selects needed fields and inserts them
    into song and artist tables.

    Parameters:
        cur (psycopg2.cursor()): Cursor of the sparkifydb database
        filepath (str): Filepath of the file to be analyzed
    """
    # open song file
    df = pd.read_json(filepath, lines=True)

    # insert song record
    song_data = df[["song_id", "title", "artist_id", "year", "duration"]].values[0].tolist()
    cur.execute(song_table_insert, song_data)

    # insert artist record
    artist_data = df[["artist_id", "artist_name", "artist_location",
                      "artist_latitude", "artist_longitude"]].values[0].tolist()
    cur.execute(artist_table_insert, artist_data)


def process_log_file(cur, filepath):
    """
    This function reads Log files and reads information of time, user and
    songplay data and saves into time, user, songplay

    Arguments:
        cur: Database Cursor
        filepath: location of Log files
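# A minimal driver sketch (assumed; the original's process_data/main functions are
# not shown here): walk a hypothetical data/song_data tree and feed each JSON file
# to process_song_file through a psycopg2 cursor.
def process_song_dir(conn, song_dir="data/song_data"):
    cur = conn.cursor()
    # collect every .json file below song_dir
    song_files = glob.glob(os.path.join(song_dir, "**", "*.json"), recursive=True)
    for path in song_files:
        process_song_file(cur, path)
        conn.commit()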
def json2dtf(jsondata):
    '''get json as pandas'''
    dtf = pd.read_json(jsondata)
    return dtf
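# Usage sketch (input is assumed): json2dtf accepts anything pd.read_json accepts,
# e.g. a JSON string, a file path, or a file-like object.
dtf = json2dtf('{"name": ["a", "b"], "score": [1, 2]}')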
def load_birth_names(): """Loading birth name dataset from a zip file in the repo""" with gzip.open(os.path.join(DATA_FOLDER, 'birth_names.json.gz')) as f: pdf = pd.read_json(f) pdf.ds = pd.to_datetime(pdf.ds, unit='ms') pdf.to_sql('birth_names', db.engine, if_exists='replace', chunksize=500, dtype={ 'ds': DateTime, 'gender': String(16), 'state': String(10), 'name': String(255), }, index=False) l = [] print("Done loading table!") print("-" * 80) print("Creating table [birth_names] reference") obj = db.session.query(TBL).filter_by(table_name='birth_names').first() if not obj: obj = TBL(table_name='birth_names') obj.main_dttm_col = 'ds' obj.database = get_or_create_main_db() obj.is_featured = True db.session.merge(obj) db.session.commit() obj.fetch_metadata() tbl = obj defaults = { "compare_lag": "10", "compare_suffix": "o10Y", "datasource_id": "1", "datasource_name": "birth_names", "datasource_type": "table", "flt_op_1": "in", "limit": "25", "granularity": "ds", "groupby": [], "metric": 'sum__num', "metrics": ["sum__num"], "row_limit": config.get("ROW_LIMIT"), "since": "100 years ago", "until": "now", "viz_type": "table", "where": "", "markup_type": "markdown", } print("Creating some slices") slices = [ Slice(slice_name="Girls", viz_type='table', datasource_type='table', datasource_id=tbl.id, params=get_slice_json(defaults, groupby=['name'], flt_col_1='gender', flt_eq_1="girl", row_limit=50)), Slice(slice_name="Boys", viz_type='table', datasource_type='table', datasource_id=tbl.id, params=get_slice_json(defaults, groupby=['name'], flt_col_1='gender', flt_eq_1="boy", row_limit=50)), Slice(slice_name="Participants", viz_type='big_number', datasource_type='table', datasource_id=tbl.id, params=get_slice_json(defaults, viz_type="big_number", granularity="ds", compare_lag="5", compare_suffix="over 5Y")), Slice(slice_name="Genders", viz_type='pie', datasource_type='table', datasource_id=tbl.id, params=get_slice_json(defaults, viz_type="pie", groupby=['gender'])), Slice(slice_name="Genders by State", viz_type='dist_bar', datasource_type='table', datasource_id=tbl.id, params=get_slice_json( defaults, flt_eq_1="other", viz_type="dist_bar", metrics=['sum__sum_girls', 'sum__sum_boys'], groupby=['state'], flt_op_1='not in', flt_col_1='state')), Slice(slice_name="Trends", viz_type='line', datasource_type='table', datasource_id=tbl.id, params=get_slice_json(defaults, viz_type="line", groupby=['name'], granularity='ds', rich_tooltip='y', show_legend='y')), Slice(slice_name="Average and Sum Trends", viz_type='dual_line', datasource_type='table', datasource_id=tbl.id, params=get_slice_json(defaults, viz_type="dual_line", metric='avg__num', metric_2='sum__num', granularity='ds')), Slice(slice_name="Title", viz_type='markup', datasource_type='table', datasource_id=tbl.id, params=get_slice_json(defaults, viz_type="markup", markup_type="html", code="""\ <div style="text-align:center"> <h1>Birth Names Dashboard</h1> <p> The source dataset came from <a href="https://github.com/hadley/babynames">[here]</a> </p> <img src="/static/assets/images/babytux.jpg"> </div> """)), Slice(slice_name="Name Cloud", viz_type='word_cloud', datasource_type='table', datasource_id=tbl.id, params=get_slice_json(defaults, viz_type="word_cloud", size_from="10", series='name', size_to="70", rotation="square", limit='100')), Slice(slice_name="Pivot Table", viz_type='pivot_table', datasource_type='table', datasource_id=tbl.id, params=get_slice_json(defaults, viz_type="pivot_table", metrics=['sum__num'], groupby=['name'], 
columns=['state'])), Slice(slice_name="Number of Girls", viz_type='big_number_total', datasource_type='table', datasource_id=tbl.id, params=get_slice_json(defaults, viz_type="big_number_total", granularity="ds", flt_col_1='gender', flt_eq_1='girl', subheader='total female participants')), ] for slc in slices: merge_slice(slc) print("Creating a dashboard") dash = db.session.query(Dash).filter_by(dashboard_title="Births").first() if not dash: dash = Dash() js = textwrap.dedent("""\ [ { "col": 9, "row": 6, "size_x": 2, "size_y": 4, "slice_id": "1267" }, { "col": 11, "row": 6, "size_x": 2, "size_y": 4, "slice_id": "1268" }, { "col": 1, "row": 0, "size_x": 2, "size_y": 2, "slice_id": "1269" }, { "col": 3, "row": 0, "size_x": 2, "size_y": 2, "slice_id": "1270" }, { "col": 5, "row": 3, "size_x": 8, "size_y": 3, "slice_id": "1271" }, { "col": 1, "row": 6, "size_x": 8, "size_y": 4, "slice_id": "1272" }, { "col": 10, "row": 0, "size_x": 3, "size_y": 3, "slice_id": "1273" }, { "col": 5, "row": 0, "size_x": 5, "size_y": 3, "slice_id": "1274" }, { "col": 1, "row": 2, "size_x": 4, "size_y": 4, "slice_id": "1275" } ] """) l = json.loads(js) for i, pos in enumerate(l): pos['slice_id'] = str(slices[i].id) dash.dashboard_title = "Births" dash.position_json = json.dumps(l, indent=4) dash.slug = "births" dash.slices = slices[:-1] db.session.merge(dash) db.session.commit()
import json

import pandas as pd
import matplotlib
import wordcloud
import OpenBlender  # the call below uses OpenBlender.call(...); json is needed for json.dumps

action = 'API_getObservationsFromDataset'
parameters = {
    'token': 'YOUR_TOKEN_HERE',
    'id_dataset': '5e6ac97595162921fda18076',
    'date_filter': {
        "start_date": "2020-01-01T06:00:00.000Z",
        "end_date": "2020-03-11T06:00:00.000Z"
    },
}
df_confirmed = pd.read_json(json.dumps(
    OpenBlender.call(action, parameters)['sample']),
    convert_dates=False,
    convert_axes=False).sort_values('timestamp', ascending=False)
df_confirmed.reset_index(drop=True, inplace=True)
df_confirmed.head(10)
def plot_history(history, metric, orient='index', validation=False, title='Metric Function', xlabel='epochs', ylabel='metric', logx=False, logy=False, smooth=None, smooth_window=10, alpha=None, legend='auto', bbox_to_anchor=(1.0, 0.0), anchor='lower left', base_size=(640, 480), base_ratio=(1, 1), subplots=None, return_ax=False, out_name=None, dpi=72, **kwargs): ''' Plot the a metric or loss function from a Keras history. Required arguments: history: location of the JSON file containing the history of the training, metric: the name of the metric to plot or list of metrics,. Optional arguments: orient: orientation of the JSON file (for Pandas), out_name: name of the ouput (without extension), root: root of the save location, validation: plot validation loss, title: title of the plot, logx: use log scale on x-axis, logy: use log scale on y-axis, smooth: smoothen the behaviour by considering a moving average, smooth_window: width of the smoothing window, alpha: transparency factor, legend: type of legend to draw (https://seaborn.pydata.org/generated/seaborn.scatterplot.html#seaborn.scatterplot), bbox_to_anchor: position of the legend, anchor: anchor of the legend, base_size: tuple with the size of the output, base_ratio: ratio of the output plot (tuple), subplots: pass a tuple of (figure, axis) to use existing axis, return_ax: return the axis object if requested, out_name: name of the ouput (without extension), **kwargs: additional arguments to pass to savefig. ''' # open JSON file if isinstance(history, str): hst = pd.read_json(history, orient=orient) else: hst = pd.DataFrame(history) # select data if not isinstance(metric, list): metric = [metric] # plot the function if subplots is not None: fig, ax = subplots else: X, Y = base_ratio fig, ax = plt.subplots(1, 1, figsize=ratio(base_ratio[0], base_ratio[1], base=base_size, dpi=dpi), dpi=dpi) # define the palettes plot_palette_train = ['tab:blue', 'tab:cyan', 'tab:purple', 'tab:grey'] plot_palette_val = ['tab:red', 'tab:orange', 'tab:pink', 'tab:olive'] # plot the metrics plot_metrics = [] plot_palette = {} plot_dashes = {} plot_hst = {} for n, m in enumerate(metric): plot_metrics.append(m) plot_palette[m] = plot_palette_train[n] plot_dashes[m] = '-' # smooth if requested if smooth: if 1 < smooth_window < np.shape(hst[m])[0]: plot_hst[m] = np.convolve(hst[m], np.ones(smooth_window) / smooth_window, mode='valid') else: plot_hst[m] = hst[m] if validation: plot_metrics.append('val_' + m) plot_palette['val_' + m] = plot_palette_val[n] plot_dashes['val_' + m] = '--' # smooth if requested if smooth: if 1 < smooth_window < np.shape(hst['val_' + m])[0]: plot_hst['val_' + m] = np.convolve(hst['val_' + m], np.ones(smooth_window) / smooth_window, mode='valid') else: plot_hst['val_' + m] = hst['val_' + m] # plot the new metrics plot_hst = pd.DataFrame(plot_hst) for var in plot_metrics: lineplot = { 'data': plot_hst, 'x': plot_hst.index, 'y': var, 'alpha': alpha, 'linestyle': plot_dashes[var], 'color': plot_palette[var], 'legend': legend } sns.lineplot(**lineplot, ax=ax) ax.set_title(title) ax.set_xlabel(xlabel) ax.set_ylabel(ylabel) if len(plot_metrics) > 1: ax.legend(labels=plot_metrics, bbox_to_anchor=bbox_to_anchor, loc=anchor) if logx: ax.set_xscale('log') if logy: ax.set_yscale('log') # save the figure if out_name is not None: savefig(out_name, fig, dpi=dpi, **kwargs) if return_ax: return ax
Anthony Ralston - [email protected]
"""

# Import required libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import json

# Read json data & prepare dataframe
print("Reading json data & preparing dataframes...")
training_dataframe = pd.read_json('train.json').set_index('id')
testing_dataframe = pd.read_json('test.json').set_index('id')
training_df_index = training_dataframe.index
testing_df_index = testing_dataframe.index
y = training_dataframe.cuisine.copy()
print("Training Data Samples: ", training_dataframe.shape)
print("Testing Data Samples: ", testing_dataframe.shape)

# Combine for pre-processing
print("Combining data for pre-processing...")
dataframe = pd.concat(
    [training_dataframe.drop('cuisine', axis=1), testing_dataframe])
dataframe_index = dataframe.index
print("Concatenated Samples: ", dataframe.shape)

# Visualise cuisine training data
import json
from random import shuffle

import pandas as pd
import numpy as np
from matplotlib import pyplot
from sklearn.neighbors import NearestNeighbors

repr_json = "../output/test_representations.json"
data = pd.read_json(repr_json)
data = data.sample(n=1000)
img_repr = data['image_repr'].tolist()
text_repr = data['text_repr'].tolist()

nn = NearestNeighbors(n_jobs=-1, n_neighbors=1000)
nn.fit(text_repr)
preds = nn.kneighbors(img_repr, return_distance=False).tolist()

ranks = []
for i, x in enumerate(preds):
    rank = x.index(i) + 1
    ranks.append(rank)

print("Average rank :", np.mean(ranks))
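# Assumed extension of the ranking evaluation above: besides the mean rank, a
# common retrieval metric is recall@k, the fraction of queries whose matching
# text representation lands in the top k neighbours.
for k in (1, 5, 10):
    recall_at_k = np.mean([1 if r <= k else 0 for r in ranks])
    print("Recall@{} : {:.3f}".format(k, recall_at_k))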
def extractApiData(self): print('\nExtracting Glofas Data\n') # Load input data df_thresholds = pd.read_json(json.dumps(self.GLOFAS_STATIONS)) df_thresholds = df_thresholds.set_index("stationCode", drop=False) df_district_mapping = pd.read_json(json.dumps(self.DISTRICT_MAPPING)) df_district_mapping = df_district_mapping.set_index("glofasStation", drop=False) # Set up variables to fill stations = [] trigger_per_day = { 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, } # Load netCDF data ncData = xr.open_dataset(self.inputPath + 'glofas-api-' + self.country_code + '-' + self.current_date + '.nc') # Transform lon/lat values lons = np.linspace( ncData.dis24.attrs['GRIB_longitudeOfFirstGridPointInDegrees'], ncData.dis24.attrs['GRIB_longitudeOfLastGridPointInDegrees'], num=ncData.dis24.attrs['GRIB_Nx']) lats = np.linspace( ncData.dis24.attrs['GRIB_latitudeOfFirstGridPointInDegrees'], ncData.dis24.attrs['GRIB_latitudeOfLastGridPointInDegrees'], num=ncData.dis24.attrs['GRIB_Ny']) ds = ncData['dis24'] ds.coords['latitude'] = lats ds.coords['longitude'] = lons ncData2 = ds.to_dataset() for index, row in df_thresholds.iterrows(): station = {} station['code'] = row['stationCode'] if station['code'] in df_district_mapping[ 'glofasStation'] and station['code'] != 'no_station': print(station['code']) threshold = df_thresholds[df_thresholds['stationCode'] == station['code']][TRIGGER_LEVEL][0] for step in range(1, 8): # Loop through 51 ensembles, get forecast and compare to threshold ensemble_options = 51 count = 0 dis_sum = 0 deltax = 0.1 st_lat = row['lat'] #34.05 st_lon = row['lon'] #0.05 for ensemble in range(1, ensemble_options): dischargeArray = ncData2['dis24'].sel( latitude=slice(st_lat + deltax, st_lat - deltax), longitude=slice(st_lon - deltax, st_lon + deltax), step=str(step) + ' days', number=ensemble).values.flatten() discharge = np.nanmax(dischargeArray) # MOCK OVERWRITE DEPENDING ON COUNTRY SETTING if SETTINGS_SECRET[self.country_code]['mock'] == True: if SETTINGS_SECRET[self.country_code][ 'if_mock_trigger'] == True: if step < 5: # Only dummy trigger for 5-day and above discharge = 0 elif station[ 'code'] == 'DWRM1': # UGA dummy flood station 1 discharge = 1000 elif station[ 'code'] == 'G1067': # ETH dummy flood station 1 discharge = 1000 elif station[ 'code'] == 'G1904': # ETH dummy flood station 2 discharge = 2000 elif station[ 'code'] == 'G5194': # KEN dummy flood station discharge = 2000 else: discharge = 0 else: discharge = 0 if discharge >= threshold: count = count + 1 dis_sum = dis_sum + discharge prob = count / ensemble_options dis_avg = dis_sum / ensemble_options station['fc'] = dis_avg station['fc_prob'] = prob station['fc_trigger'] = 1 if prob > TRIGGER_LEVELS[ 'minimum'] else 0 if station['fc_trigger'] == 1: trigger_per_day[step] = 1 if step == self.leadTimeValue: stations.append(station) station = {} station['code'] = row['stationCode'] # Add 'no_station' for station_code in ['no_station']: station = {} station['code'] = station_code station['fc'] = 0 station['fc_prob'] = 0 station['fc_trigger'] = 0 stations.append(station) with open(self.extractedGlofasPath, 'w') as fp: json.dump(stations, fp) print('Extracted Glofas data - File saved') with open(self.triggerPerDay, 'w') as fp: json.dump([trigger_per_day], fp) print('Extracted Glofas data - Trigger per day File saved')
    if isfile(join(current_dataset_path, f)) and f.endswith('.json')
]
print("Following dataset files have been found: {}".format(all_files_names))

dataset_paths = []
for path in all_files_names:
    dataset_path = current_dataset_path + path
    dataset_paths.append(dataset_path)
print("All files found: {}".format(dataset_paths))

y_total = None
x_total = None
for (index, path) in enumerate(dataset_paths):
    print("For file at {}".format(path))
    df = pd.read_json(path)
    keypoints = df['Keypoints'].values
    file_y = df['Category'].values
    file_x = []
    for k in keypoints:
        if k is not None:
            newK = np.reshape(np.asarray(k), (25, 3))
            file_x.append(newK)
        else:
            file_x.append(np.reshape(np.zeros(75), (25, 3)))
    file_x = np.array(file_x)
    print("For file at {} found {} frames".format(path, file_y.shape[0]))
    if np.all(x_total is None):
        x_total = file_x
    else:
        x_total = np.vstack((x_total, file_x))
def time_series_day_of_week_plot(self, data_path): #data_path = os.path.dirname(__file__) + "/../data/pbFollowers/merged/" data_path = data_path + "merged/" dir_files = os.listdir(data_path) counts_of_tweets = {} for filename in dir_files: df = pd.read_json(data_path + filename) df = df.set_index(df['created_at']) temp = pd.DatetimeIndex(df['created_at']) df['weekday'] = temp.weekday_name p = df.groupby(df['weekday']) freq_of_tweets = p['created_at'].count() freq_dict = freq_of_tweets.to_dict() for key, value in freq_dict.items(): try: counts_of_tweets[key] += value except KeyError: counts_of_tweets[key] = value # index like freq_dict.get(pd.Timestamp('2018-01-31')) #dates = np.fromiter(counts_of_tweets.keys(), dtype=object) #counts = np.fromiter(counts_of_tweets.values(), dtype=float) # output to static HTML file tweet_freq_plot_path = data_path + "../plots/Tweet_freq_day_of_week.html" output_file(tweet_freq_plot_path) days_of_week = [ 'Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday' ] counts_of_week = [ counts_of_tweets[days_of_week[0]], counts_of_tweets[days_of_week[1]], counts_of_tweets[days_of_week[2]], counts_of_tweets[days_of_week[3]], counts_of_tweets[days_of_week[4]], counts_of_tweets[days_of_week[5]], counts_of_tweets[days_of_week[6]] ] fill_color = [ "#ff0000", "#ff4000", "#ff8000", "#ffbf00", "#ffsff00", "#bfff00", "#80ff00" ] source = ColumnDataSource(data=dict(days_of_week=days_of_week, counts_of_week=counts_of_week, fill_color=fill_color)) hover = HoverTool(tooltips=[ ('days_of_week', '@days_of_week'), ('counts_of_week', '@counts_of_week'), ]) p = figure(x_range='days_of_week', plot_height=350, toolbar_location=None, title="Freq of Tweets by Day of Week", tools=[hover]) p.vbar(x='days_of_week', top='counts_of_week', width=0.9, source=source, line_color='white', fill_color='fill_color') #show(p) return p
def time_series_frequency_analysis(self, data_path): #data_path = os.path.dirname(__file__) + "/../data/pbFollowers/merged/" data_path = data_path + "merged/" dir_files = os.listdir(data_path) counts_of_tweets = {} for filename in dir_files: df = pd.read_json(data_path + filename) df = df.set_index(df['created_at']) p = df.groupby(pd.Grouper(freq="D")) freq_of_tweets = p['created_at'].count() freq_dict = freq_of_tweets.to_dict() for key, value in freq_dict.items(): try: counts_of_tweets[key.to_pydatetime()] += value except KeyError: counts_of_tweets[key.to_pydatetime()] = value # index like freq_dict.get(pd.Timestamp('2018-01-31')) dates = np.fromiter(counts_of_tweets.keys(), dtype='datetime64[us]') counts = np.fromiter(counts_of_tweets.values(), dtype=float) real_dates = [str(x)[:10] for x in dates] # window_size = 30 # window = np.ones(window_size) / float(window_size) # counts_avg = np.convolve(counts, window, 'same') # output to static HTML file tweet_freq_plot_path = data_path + "../plots/Tweet_freq_by_day.html" output_file(tweet_freq_plot_path, title="Tweet frequency of my followers") source = ColumnDataSource( data=dict(dates=dates, tweet_counts=counts, real_date=real_dates)) hover = HoverTool(tooltips=[ ('date', '@real_date'), ('tweet_counts', '@tweet_counts'), ]) # create a new plot with a datetime axis type p = figure(width=800, height=350, x_axis_type="datetime", tools=[hover, 'box_zoom', 'pan', 'wheel_zoom', 'reset']) # add renderers #p.circle(dates, counts, size=4, color='blue', alpha=0.8) p.circle(x='dates', y='tweet_counts', size=6, source=source, color='blue', alpha=0.6) # p.line(dates, counts_avg, color='grey') p.title.text = "Tweet Frequency of @patrickbeekman's followers" p.xaxis.axis_label = 'Date' p.yaxis.axis_label = '# of Tweets' #show(p) print("Frequency of tweets graph created!") return p
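# pd.TimeGrouper was removed in pandas 1.0; pd.Grouper(freq="D") over a DatetimeIndex
# produces the same daily bins. A minimal sketch of the daily tweet-count step above,
# with made-up timestamps:
import pandas as pd

df = pd.DataFrame({'created_at': pd.to_datetime(
    ['2018-01-31 08:00', '2018-01-31 17:00', '2018-02-01 09:00'])})
df = df.set_index(df['created_at'])
daily = df.groupby(pd.Grouper(freq="D"))['created_at'].count()
print(daily.to_dict())   # keys are day-start Timestamps: 2018-01-31 -> 2, 2018-02-01 -> 1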
def main(): parser = argparse.ArgumentParser() parser.add_argument('--source', type=Path, help='Source dir', required=True) parser.add_argument('--videodataset', type=Path, default='data/dfdc_videos.pkl', help='Path to save the videos DataFrame') parser.add_argument('--batch', type=int, help='Batch size', default=64) args = parser.parse_args() ## Parameters parsing source_dir: Path = args.source videodataset_path: Path = args.videodataset batch_size: int = args.batch ## DataFrame if videodataset_path.exists(): print('Loading video DataFrame') df_videos = pd.read_pickle(videodataset_path) else: print('Creating video DataFrame') # Create output folder videodataset_path.parent.mkdir(parents=True, exist_ok=True) # Index df_train_list = list() for idx, json_path in enumerate( tqdm(sorted(source_dir.rglob('metadata.json')), desc='Indexing')): df_tmp = pd.read_json(json_path, orient='index') df_tmp['path'] = df_tmp.index.map(lambda x: str( json_path.parent.relative_to(source_dir).joinpath(x))) df_tmp['folder'] = int(str(json_path.parts[-2]).split('_')[-1]) df_train_list.append(df_tmp) df_videos = pd.concat(df_train_list, axis=0, verify_integrity=True) # Save space del df_videos['split'] df_videos['label'] = df_videos['label'] == 'FAKE' df_videos['original'] = df_videos['original'].astype('category') df_videos['folder'] = df_videos['folder'].astype(np.uint8) # Collect metadata paths_arr = np.asarray( df_videos.path.map(lambda x: str(source_dir.joinpath(x)))) height_list = [] width_list = [] frames_list = [] with Pool() as pool: for batch_idx0 in tqdm(np.arange(start=0, stop=len(df_videos), step=batch_size), desc='Metadata'): batch_res = pool.map( extract_meta_av, paths_arr[batch_idx0:batch_idx0 + batch_size]) for res in batch_res: height_list.append(res[0]) width_list.append(res[1]) frames_list.append(res[2]) df_videos['height'] = np.asarray(height_list, dtype=np.uint16) df_videos['width'] = np.asarray(width_list, dtype=np.uint16) df_videos['frames'] = np.asarray(frames_list, dtype=np.uint16) print('Saving video DataFrame to {}'.format(videodataset_path)) df_videos.to_pickle(str(videodataset_path)) print('Real videos: {:d}'.format(sum(df_videos['label'] == 0))) print('Fake videos: {:d}'.format(sum(df_videos['label'] == 1)))
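# A small illustration of the orient='index' read above: DFDC metadata.json maps each
# video filename to a dict of attributes, so with orient='index' the filenames become
# the row index. The JSON below is made up.
import io
import pandas as pd

meta = '{"aaa.mp4": {"label": "FAKE", "original": "bbb.mp4"}, "bbb.mp4": {"label": "REAL", "original": null}}'
df_meta = pd.read_json(io.StringIO(meta), orient='index')
print(df_meta)                      # index: aaa.mp4, bbb.mp4; columns: label, original
print(df_meta['label'] == 'FAKE')   # boolean label column, as built in the loader above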
b = df_fluxo_de_pessoas['tarde'] / df_quantidade_de_dias_semana['tarde'] c = df_fluxo_de_pessoas['noite'] / df_quantidade_de_dias_semana['noite'] df_fluxo_medio_pessoas = pd.concat([a, b, c], axis=1, sort=False) #### BUILD THE AVERAGE FLOW df_fluxo_medio_pessoas.to_csv('Resource/fluxo_medio_pessoas.csv', sep=';', index=True) if __name__ == "__main__": #### COMPETITORS #path_current = os.getcwd() #path_complet_with_csv = '/Dados_Cliente/concorrentes.csv' df_concorrente = pd.read_csv('Dados_Cliente/concorrentes.csv') novo_concorrente(df_concorrente) ### NEIGHBORHOODS #path_complet_with_csv = os.path.join(path_current +'/Dados_Cliente/populacao.json') populacao = pd.read_json('Dados_Cliente/populacao.json') populacao = pd.DataFrame(data=populacao) #path_complet_with_csv = os.path.join(path_current +'/Dados_Cliente/bairros.csv') bairro = pd.read_csv('Dados_Cliente/bairros.csv') novo_bairro(bairro, populacao) ### CALLING THE EVENTS #path_complet_with_csv = os.path.join(path_current +'/Dados_Cliente/eventos_de_fluxo.csv') eventos_fluxo = pd.read_csv('Dados_Cliente/eventos_de_fluxo.csv') calculo_dias(eventos_fluxo) print('Done')
# MONGO_COLLECTION = 'shop_high0727' client = pymongo.MongoClient(MONGO_URL) db = client[MONGO_DB] def save_to_mongo(result): """ Save a result to MongoDB :param result: the record to store """ try: if db[MONGO_COLLECTION].insert_one(result): print('Saved to MongoDB successfully') except Exception: print('Failed to save to MongoDB') df1 = pd.read_json('e-cigar-high.json') df2 = pd.read_json('e-cigar-low.json') df = pd.concat([df1, df2]) shop_link = df['s_link'].value_counts().index.tolist() print('Found {} shops in total'.format(len(shop_link))) c = 0 for link in shop_link: c += 1 shop_id = link[link.find('id=')+3:] print('Looking up shop #{}'.format(c)) url = 'https://hdc1.alicdn.com/asyn.htm?userId='+shop_id res = requests.get(url).text ch2del = ('\\r\\n','\r\n','\\','\t','\n') for ch in ch2del: res = res.replace(ch,'') doc = pq(res)
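# A minimal sketch of the shop-link counting step above: value_counts() yields each
# unique s_link once, ordered by how many listings point to it, and the shop id is
# sliced off after 'id='. The frame below is illustrative.
import pandas as pd

df_links = pd.DataFrame({'s_link': [
    'https://shop.example.com/?id=111',
    'https://shop.example.com/?id=222',
    'https://shop.example.com/?id=111']})
shop_link = df_links['s_link'].value_counts().index.tolist()
print('Found {} shops in total'.format(len(shop_link)))
for link in shop_link:
    print(link[link.find('id=') + 3:])   # '111', '222'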
def load_world_bank_health_n_pop(): """Loads the world bank health dataset, slices and a dashboard""" tbl_name = 'wb_health_population' with gzip.open(os.path.join(DATA_FOLDER, 'countries.json.gz')) as f: pdf = pd.read_json(f) pdf.columns = [col.replace('.', '_') for col in pdf.columns] pdf.year = pd.to_datetime(pdf.year) pdf.to_sql(tbl_name, db.engine, if_exists='replace', chunksize=50, dtype={ 'year': DateTime(), 'country_code': String(3), 'country_name': String(255), 'region': String(255), }, index=False) print("Creating table [wb_health_population] reference") tbl = db.session.query(TBL).filter_by(table_name=tbl_name).first() if not tbl: tbl = TBL(table_name=tbl_name) tbl.description = utils.readfile(os.path.join(DATA_FOLDER, 'countries.md')) tbl.main_dttm_col = 'year' tbl.is_featured = True tbl.database = get_or_create_main_db() db.session.merge(tbl) db.session.commit() tbl.fetch_metadata() defaults = { "compare_lag": "10", "compare_suffix": "o10Y", "datasource_id": "1", "datasource_name": "birth_names", "datasource_type": "table", "limit": "25", "granularity": "year", "groupby": [], "metric": 'sum__SP_POP_TOTL', "metrics": ["sum__SP_POP_TOTL"], "row_limit": config.get("ROW_LIMIT"), "since": "2014-01-01", "until": "2014-01-02", "where": "", "markup_type": "markdown", "country_fieldtype": "cca3", "secondary_metric": "sum__SP_POP_TOTL", "entity": "country_code", "show_bubbles": "y", } print("Creating slices") slices = [ Slice(slice_name="Region Filter", viz_type='filter_box', datasource_type='table', datasource_id=tbl.id, params=get_slice_json(defaults, viz_type='filter_box', groupby=['region', 'country_name'])), Slice(slice_name="World's Population", viz_type='big_number', datasource_type='table', datasource_id=tbl.id, params=get_slice_json(defaults, since='2000', viz_type='big_number', compare_lag="10", metric='sum__SP_POP_TOTL', compare_suffix="over 10Y")), Slice(slice_name="Most Populated Countries", viz_type='table', datasource_type='table', datasource_id=tbl.id, params=get_slice_json(defaults, viz_type='table', metrics=["sum__SP_POP_TOTL"], groupby=['country_name'])), Slice(slice_name="Growth Rate", viz_type='line', datasource_type='table', datasource_id=tbl.id, params=get_slice_json(defaults, viz_type='line', since="1960-01-01", metrics=["sum__SP_POP_TOTL"], num_period_compare="10", groupby=['country_name'])), Slice(slice_name="% Rural", viz_type='world_map', datasource_type='table', datasource_id=tbl.id, params=get_slice_json(defaults, viz_type='world_map', metric="sum__SP_RUR_TOTL_ZS", num_period_compare="10")), Slice(slice_name="Life Expectancy VS Rural %", viz_type='bubble', datasource_type='table', datasource_id=tbl.id, params=get_slice_json( defaults, viz_type='bubble', since="2011-01-01", until="2011-01-02", series="region", limit="0", entity="country_name", x="sum__SP_RUR_TOTL_ZS", y="sum__SP_DYN_LE00_IN", size="sum__SP_POP_TOTL", max_bubble_size="50", flt_col_1="country_code", flt_op_1="not in", flt_eq_1= "TCA,MNP,DMA,MHL,MCO,SXM,CYM,TUV,IMY,KNA,ASM,ADO,AMA,PLW", num_period_compare="10", )), Slice(slice_name="Rural Breakdown", viz_type='sunburst', datasource_type='table', datasource_id=tbl.id, params=get_slice_json( defaults, viz_type='sunburst', groupby=["region", "country_name"], secondary_metric="sum__SP_RUR_TOTL", since="2011-01-01", until="2011-01-01", )), Slice(slice_name="World's Pop Growth", viz_type='area', datasource_type='table', datasource_id=tbl.id, params=get_slice_json( defaults, since="1960-01-01", until="now", viz_type='area', groupby=["region"], )), 
Slice(slice_name="Box plot", viz_type='box_plot', datasource_type='table', datasource_id=tbl.id, params=get_slice_json( defaults, since="1960-01-01", until="now", whisker_options="Min/max (no outliers)", viz_type='box_plot', groupby=["region"], )), Slice(slice_name="Treemap", viz_type='treemap', datasource_type='table', datasource_id=tbl.id, params=get_slice_json( defaults, since="1960-01-01", until="now", viz_type='treemap', metrics=["sum__SP_POP_TOTL"], groupby=["region", "country_code"], )), Slice(slice_name="Parallel Coordinates", viz_type='para', datasource_type='table', datasource_id=tbl.id, params=get_slice_json( defaults, since="2011-01-01", until="2011-01-01", viz_type='para', limit=100, metrics=[ "sum__SP_POP_TOTL", 'sum__SP_RUR_TOTL_ZS', 'sum__SH_DYN_AIDS' ], secondary_metric='sum__SP_POP_TOTL', series="country_name", )), ] misc_dash_slices.append(slices[-1].slice_name) for slc in slices: merge_slice(slc) print("Creating a World's Health Bank dashboard") dash_name = "World's Bank Data" slug = "world_health" dash = db.session.query(Dash).filter_by(slug=slug).first() if not dash: dash = Dash() js = textwrap.dedent("""\ [ { "col": 1, "row": 0, "size_x": 2, "size_y": 2, "slice_id": "1231" }, { "col": 1, "row": 2, "size_x": 2, "size_y": 2, "slice_id": "1232" }, { "col": 10, "row": 0, "size_x": 3, "size_y": 7, "slice_id": "1233" }, { "col": 1, "row": 4, "size_x": 6, "size_y": 3, "slice_id": "1234" }, { "col": 3, "row": 0, "size_x": 7, "size_y": 4, "slice_id": "1235" }, { "col": 5, "row": 7, "size_x": 8, "size_y": 4, "slice_id": "1236" }, { "col": 7, "row": 4, "size_x": 3, "size_y": 3, "slice_id": "1237" }, { "col": 1, "row": 7, "size_x": 4, "size_y": 4, "slice_id": "1238" }, { "col": 9, "row": 11, "size_x": 4, "size_y": 4, "slice_id": "1239" }, { "col": 1, "row": 11, "size_x": 8, "size_y": 4, "slice_id": "1240" } ] """) l = json.loads(js) for i, pos in enumerate(l): pos['slice_id'] = str(slices[i].id) dash.dashboard_title = dash_name dash.position_json = json.dumps(l, indent=4) dash.slug = slug dash.slices = slices[:-1] db.session.merge(dash) db.session.commit()
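# pd.read_json reads directly from an open gzip handle, as in the loader above; it
# also accepts a .gz path and infers the compression. A tiny round-trip sketch with a
# throwaway file and made-up columns:
import gzip
import pandas as pd

sample = pd.DataFrame({'country_code': ['USA', 'BRA'], 'year': ['2014-01-01', '2014-01-01']})
sample.to_json('countries_sample.json.gz', orient='records', compression='gzip')
with gzip.open('countries_sample.json.gz') as f:
    print(pd.read_json(f))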
import imageio from amftrack.pipeline.functions.image_processing.experiment_class_surf import ( Experiment, save_graphs, load_graphs, ) from amftrack.transfer.functions.transfer import upload, zip_file from amftrack.pipeline.functions.post_processing.extract_study_zone import ( load_study_zone, ) directory = str(sys.argv[1]) overwrite = eval(sys.argv[2]) i = int(sys.argv[-1]) op_id = int(sys.argv[-2]) run_info = pd.read_json(f"{temp_path}/{op_id}.json") list_f, list_args = pickle.load(open(f"{temp_path}/{op_id}.pick", "rb")) folder_list = list(run_info["folder_analysis"]) directory_name = folder_list[i] select = run_info.loc[run_info["folder_analysis"] == directory_name] row = [row for index, row in select.iterrows()][0] plate_num = row["Plate"] path_exp = f'{directory}{row["path_exp"]}' exp = pickle.load(open(path_exp, "rb")) try: exp.labeled except AttributeError: exp.labeled = True load_study_zone(exp)
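# A minimal sketch of the row-selection pattern above: read the job table from JSON,
# filter on folder_analysis, and take the single matching row as a Series (iloc[0] is
# a shorter equivalent of the iterrows() comprehension). Columns and values below are
# illustrative.
import io
import pandas as pd

run_info = pd.read_json(io.StringIO(
    '[{"folder_analysis": "plate_01", "Plate": 1}, {"folder_analysis": "plate_02", "Plate": 2}]'))
select = run_info.loc[run_info["folder_analysis"] == "plate_02"]
row = select.iloc[0]
print(row["Plate"])   # 2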
import pandas as pd import spotipy from spotipy.oauth2 import SpotifyClientCredentials from collections import defaultdict client_credentials_manager = SpotifyClientCredentials( client_id="CLIENT_ID", client_secret="CLIENT_SECRET") sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager) df = pd.read_json('raw/data.json') def get_audio_features(title, artist): print("Searching Spotify for", title, "by", artist) songs = sp.search(q='track:' + title + ' artist:' + artist + '*', type='track') items = songs['tracks']['items'] if len(items) > 0: print("Getting audio features") song_id = str(items[0]['id']) features = sp.audio_features(song_id)[0] if len(features) >= 18: return features print("None found")
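# A hedged sketch of how the per-track lookup above could be applied across the whole
# frame; look_up is a stub standing in for get_audio_features, since the real call
# needs live Spotify credentials, and the column names are illustrative.
import io
import pandas as pd

tracks = pd.read_json(io.StringIO(
    '[{"title": "Song A", "artist": "Artist X"}, {"title": "Song B", "artist": "Artist Y"}]'))

def look_up(title, artist):
    # placeholder for sp.audio_features(...)[0]
    return {"danceability": 0.5, "energy": 0.7}

features = tracks.apply(lambda r: look_up(r['title'], r['artist']), axis=1)
print(pd.DataFrame(features.tolist()))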
def gera_box(): dados = pd.read_json(diretorio + jsonBox) print('\n Boxplot of the tweets:') dados.boxplot(column=['Polaridade', 'Subjetividade'], vert=False) mpl.show()
street_hash = feature_hash.get_feature_set(street_list) rent_frame.drop(text_columns,axis=1,inplace=True) numerical_features = rent_frame.values return numpy.hstack((numerical_features,description_hash,features_list_hash,address_hash,street_hash)) def print_evaluation_metrics(trained_model,trained_model_name,X_test,y_test): print('--------- For Model : ', trained_model_name, '--------------------') predicted_values = trained_model.predict(X_test) print(metrics.classification_report(y_test,predicted_values)) print("Accuracy Score : ",metrics.accuracy_score(y_test,predicted_values)) print("---------------------------------------\n") filename = 'train.json' rent_frame = pd.read_json(filename) print(rent_frame.columns) print(len(rent_frame)) print(rent_frame.head(3)) target_variable = 'interest_level' columns_to_delete = ['building_id','created','listing_id','manager_id','photos','interest_level'] class_labels = list(rent_frame[target_variable].values) text_columns = ['description','features','display_address','street_address'] rent_frame.drop(columns_to_delete,axis=1,inplace=True) full_features = get_label_encoded_features(rent_frame,text_columns) X_train,X_test,y_train,y_test = train_test_split(full_features,class_labels,test_size=0.2,random_state=42) classifier_list,classifier_name_list = get_ensemble_models() for classifier,classifier_name in zip(classifier_list,classifier_name_list): classifier.fit(X_train,y_train) print_evaluation_metrics(classifier,classifier_name,X_test,y_test)
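# A minimal, self-contained sketch of the split-and-evaluate pattern above, with a
# tiny synthetic frame standing in for train.json and a single classifier instead of
# the ensemble list.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

frame = pd.DataFrame({
    'bathrooms': [1, 2, 1, 3, 2, 1, 2, 1],
    'bedrooms': [1, 3, 2, 4, 2, 1, 3, 2],
    'price': [2400, 5200, 3100, 8800, 4000, 2100, 5600, 3000],
    'interest_level': ['low', 'high', 'medium', 'high', 'medium', 'low', 'high', 'low'],
})
X = frame.drop(columns=['interest_level']).values
y = frame['interest_level'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
clf = RandomForestClassifier(n_estimators=50, random_state=42).fit(X_train, y_train)
predicted = clf.predict(X_test)
print(classification_report(y_test, predicted, zero_division=0))
print('Accuracy Score :', accuracy_score(y_test, predicted))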