def run_demo():
    # df = pd.read_sql('SELECT * FROM GAMEAPP;', con=db)
    (var, tp, timestart, timeend) = open()
    gym = time_list(var, tp, timestart, timeend)
    a = input('type:')
    a = a.lower()
    anly_dict = dict()
    for k in gym:
        anly_dict[k] = gym[k][a]
    data = pd.DataFrame.from_dict(anly_dict, orient='columns', dtype=None)
    # keep only a small sample to cut runtime and make testing easier
    data_other, data = cross_validation.train_test_split(data, test_size=0.001, random_state=10)
    train_and_valid, test = cross_validation.train_test_split(data, test_size=0.2, random_state=10)
    train, valid = cross_validation.train_test_split(train_and_valid, test_size=0.01, random_state=10)
    train_feature, train_target = get_features_target(train)
    test_feature, test_target = get_features_target(test)
    valid_feature, valid_target = get_features_target(valid)
    params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
              'learning_rate': 0.01, 'loss': 'ls'}
    clf = ensemble.GradientBoostingRegressor(**params)
    clf.fit(train_feature, train_target)  # train the model
    # mse = mean_squared_error(test_target, clf.predict(test_feature))  # predict and compute the MSE
    # print(mse)
    pre = clf.predict(test_feature)
    pre_list = list(pre)
    real_pre_zip = zip(test_target, pre_list)
    count = len(pre_list)
    error = rmspe(real_pre_zip, count)
    print(error)
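# The `rmspe` helper called above is not defined in this snippet. A minimal sketch of a
# root-mean-square percentage error matching the call signature rmspe(real_pre_zip, count)
# is given below; the project's actual implementation may differ (this is an assumption).
def rmspe(real_pre_zip, count):
    # real_pre_zip yields (actual, predicted) pairs; count is the number of predictions
    total = 0.0
    for real, pred in real_pre_zip:
        if real != 0:  # skip zero targets to avoid division by zero
            total += ((real - pred) / real) ** 2
    return (total / count) ** 0.5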
def create_SP_500_member_df():
    '''Creates the data frame that will be used for modeling, with the column
    SP_500_member == 1 when the stock is a member.'''
    quarter_order, quarter_membership_lists = generate_sp_membership_list()
    df = load_quarterly()
    SP_500_member = np.zeros((df.shape[0], 1))
    df = df.reset_index(drop=True)
    for i, row in df.iterrows():
        if row['quarter'].ordinal in quarter_order:
            if row['ticker'] in quarter_membership_lists[quarter_order.index(row['quarter'].ordinal)]:
                SP_500_member[i] = 1
    df['SP_500_member'] = SP_500_member
    items_added = 0
    rows_added_dict = {'quarter': {}, 'ticker': {}, 'SP_500_member': {}}
    for quarter in df.quarter.unique():
        for ticker in df.ticker.unique():
            if ticker not in df[df.quarter == quarter].ticker.values:
                rows_added_dict['quarter'].update({items_added: quarter.ordinal})
                rows_added_dict['ticker'].update({items_added: ticker})
                rows_added_dict['SP_500_member'].update({items_added: 0})
                items_added += 1
    df = pd.concat([df, pd.DataFrame.from_dict(rows_added_dict)])
    return df
def __init__(
        self,
        data,
        time_field=None,
        min_time=None,
        max_time=None,
        pickable=True,
        opacity=1,
        title='',
        js_function_overrides={}):
    super(Layer, self).__init__()
    if not isinstance(data, pd.DataFrame):
        # build a frame from a list of records (DataFrame.from_dict has no orient='records')
        data = pd.DataFrame.from_records(data)
    self.data = data
    class_name = self.__class__.__name__
    # Layer name for deck.gl
    self.layer_type = class_name if 'Layer' in class_name else class_name + 'Layer'
    self.js_function_overrides = js_function_overrides
    self.title = title
    self.pickable = 'true' if pickable else 'false'
    self.id = '"%s"' % uuid.uuid4()
    if time_field is not None:
        try:
            times = data[time_field]
        except KeyError:
            raise Exception("Data does not have a time field named `%s`" % time_field)
        self.update_triggers = "{getColor: [timeFilter], getElevationValue: [timeFilter]}"
    self.time_field = time_field
    self.min_time = min(times) if time_field else None
    self.max_time = max(times) if time_field else None
    self.opacity = float(opacity)
def export_df(self):
    dickey_fullers = {}
    date_index = []
    for k, v in self.regressions.items():
        dickey_fullers[k] = v['dickey_fuller']
        date_index.append(v['date_range'][0])
    out_df = pd.DataFrame.from_dict(dickey_fullers)
    out_df.index = date_index
    return out_df
def infer(request):
    form = ModelForm(None)
    if request.method == "POST":
        form = ModelForm(request.POST)
        if form.is_valid():  # cleaned_data is only populated after validation
            # one-row frame built from the form's cleaned fields
            model_data = pd.DataFrame([form.cleaned_data])
            with open(r'..\models\model.sav', 'rb') as f:  # pickle.load needs a file object, not a path
                mdl = pickle.load(f)
            pred = mdl.predict(model_data)
            print(pred)
    context = {"form": form}
    return render(request, 'py2tableau/infer_model_data.html', context)
def calculate_centroid(df_raw, pdg_acc, cwd):
    # Calculate the centroid for each cluster
    subprocess.run('cat ' + pdg_acc + '.reference_target.SNP_distances.tsv | cut -f1,5,9,12 > ' +
                   pdg_acc + '_selected_distance.tsv', shell=True, check=True)
    distance_file = pdg_acc + '_selected_distance.tsv'
    df_distance = pd.read_csv(distance_file, header=0, sep='\t')
    cluster_list = df_distance.groupby('PDS_acc').size().index.tolist()
    cluster_center_dict = {'PDS_acc': [], 'target_acc': []}
    # TODO: a custom folder name for the skesa assemblies maybe
    if not os.path.exists(os.path.join(cwd, 'fasta')):
        os.mkdir(os.path.join(cwd, 'fasta'))
    for cluster in cluster_list:
        # get all isolates in this cluster
        df_cluster = df_distance.loc[df_distance['PDS_acc'] == cluster]
        # append the dataset with switched columns to get a "full" pairwise distance matrix
        df_append = df_cluster.append(
            df_cluster.rename(columns={
                "target_acc_1": "target_acc_2",
                "target_acc_2": "target_acc_1"
            }))
        # add up all distances for each target and try downloading the skesa assembly
        # for the one with the minimum total distance
        for target in df_append.groupby('target_acc_1')['delta_positions_unambiguous'].sum().sort_values().index.tolist():
            SRR = str(df_raw.loc[df_raw['target_acc'] == target]['Run'].iloc[0])
            if SRR == 'nan':
                continue
            try:
                # try downloading the skesa assembly; if it fails, move on to the next genome in this cluster
                subprocess.check_call(
                    "dump-ref-fasta http://sra-download.ncbi.nlm.nih.gov/srapub_files/"
                    + SRR + "_" + SRR + ".realign > fasta/" + SRR + "_skeasa.fasta",
                    shell=True,
                    stderr=subprocess.DEVNULL)
                cluster_center_dict['PDS_acc'].append(cluster)
                cluster_center_dict['target_acc'].append(target)
                break
            except subprocess.CalledProcessError:
                continue
    # delete all the empty fasta files
    os.system('find . -size 0 -delete')
    df_cluster_center = pd.DataFrame.from_dict(cluster_center_dict)
    df_cluster_center.to_csv(pdg_acc + '_PDS_center.csv')
    return df_cluster_center
def segment_frame(
        self,
        data=None,
        seg_size=24 * 7,
        seg_strafe=24 * 1,
        fields='all',
        window='hamming',
):
    '''
    Uses the function segment_dict with the same parameters. It simply
    converts the data into a dictionary first, runs segment_dict, and
    converts the result back into a data frame.
    '''
    data = pd.DataFrame.from_dict(
        self.segment_dict(
            data.to_dict(),
            seg_size,
            seg_strafe,
            fields,
            window,
        ))
    return data
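# Hedged usage sketch for segment_frame: `segmenter` and `hourly_df` are hypothetical names,
# and it is assumed that segment_dict accepts the dict-of-dicts produced by DataFrame.to_dict()
# and returns a structure that DataFrame.from_dict can consume.
# segments = segmenter.segment_frame(
#     data=hourly_df,        # one row per hour of input data
#     seg_size=24 * 7,       # one-week segments
#     seg_strafe=24,         # slide the window forward one day at a time
#     fields='all',
#     window='hamming',
# )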
def recommend(self):
    self.find_nearest()
    # build self.recommendations for every customer
    for customer in self.customers:
        self.recommend(customer)
    return pd.DataFrame.from_dict(self.recommendations)
def main():
    tqdm.pandas()
    jump_range = 21
    data_path = "data/"
    if not data_path[-1] == "/" and not data_path[-1] == "\\":
        data_path = data_path + "/"
    if not os.path.exists(data_path):
        os.makedirs(data_path)
    filenames = [
        "systems_populated.json", "listings.csv", "commodities.json",
        "stations.json"
    ]
    for name in filenames:
        if need_to_pull(data_path, name):
            pull_from_eddb(data_path, name)
    commodities = read_data(data_path, "commodities.json")
    prices = read_data(data_path, "listings.csv")
    systems = read_data(data_path, "systems_populated.json")
    stations = read_data(data_path, "stations.json")
    # print("Finding the Tritium Commodity ID")
    # for commodity in commodities:
    #     if commodity["name"] == "Tritium":
    #         tritium_id = commodity["id"]
    #         break
    filtered_trades_stations = {}
    result = None
    print("Trimming stations to only those that we can land at and have markets")
    for station in stations:
        if station["has_market"] and station["max_landing_pad_size"] == "L" \
                and not station["type"] == "Fleet Carrier":
            station_id = station["id"]
            system_id = station["system_id"]
            if station_id == 3:
                print(station)
            filtered_trades_stations[station_id] = {
                "station_id": station_id,
                "system_id": system_id,
                "station_name": station["name"]
            }
    systems_df = pandas.DataFrame.from_dict(systems)
    systems_df = systems_df[["id", "name", "x", "y", "z"]]
    filtered_stations_df = pandas.DataFrame.from_dict(filtered_trades_stations, orient="index")
    print("Joining systems and stations")
    filtered_stations_systems = filtered_stations_df.merge(systems_df,
                                                           how='left',
                                                           left_on='system_id',
                                                           right_on='id')
    filtered_stations_systems.drop("id", inplace=True, axis=1)
    print("Joining stations and prices")
    prices = prices[[
        "station_id", "commodity_id", "supply", "buy_price", "sell_price",
        "demand"
    ]]
    prices = prices.merge(filtered_stations_systems,
                          how='right',
                          left_on='station_id',
                          right_on='station_id',
                          suffixes=("_prices", "_systems")).apply(lambda x: x)
    prices.dropna(how='any', inplace=True)
    prices = prices.astype({
        "station_id": "uint32",
        "commodity_id": "uint16",
        "supply": "uint32",
        "buy_price": "uint64",
        "sell_price": "uint64",
        "demand": "uint32",
        "system_id": "uint64",
        "x": "float32",
        "y": "float32",
        "z": "float32"
    })
    for commodity in tqdm(commodities, desc='Commodities'):
        commodity_id = commodity['id']
        print("Filtering prices")
        stations_prices = prices[
            (prices['station_id'].isin(filtered_trades_stations.keys()))
            & (prices['commodity_id'] == commodity_id)]
        station_prices_buy = stations_prices[
            (stations_prices['buy_price'] > 0)
            & (stations_prices['supply'] > 5000)]
        station_prices_sell = stations_prices[
            (stations_prices['sell_price'] > 0)
            & (stations_prices['demand'] > 0)]
        if not station_prices_buy.empty and not station_prices_sell.empty:
            print("Joining buy and sell stations")
            print(station_prices_buy.info())
            print(station_prices_sell.info())
            # station_prices_diff = station_prices_buy.merge(station_prices_sell, how='left', on='join_key', suffixes=('_buy', '_sell'))
            station_prices_diff = cartesian_product_multi(
                *[station_prices_buy, station_prices_sell])
            # station_prices_diff.columns = stations_prices.columns
            print(station_prices_diff.info())
            print(station_prices_diff)
            print("Getting profit for each route")
            station_prices_diff['profit'] = station_prices_diff[
                'sell_price_sell'] - station_prices_diff['buy_price_buy']
            print("Getting distance of each route")
            station_prices_diff['route_distance'] = station_prices_diff.progress_apply(
                lambda row: get_distance(row["x_buy"], row["y_buy"], row["z_buy"],
                                         row["x_sell"], row["y_sell"], row["z_sell"]),
                axis=1)
            print("Getting jumps per route")
            station_prices_diff['route_jumps'] = np.ceil(
                station_prices_diff['route_distance'] / jump_range)
            # treat same-system routes as one jump so profit_per_jump never divides by zero
            station_prices_diff.loc[station_prices_diff['route_jumps'] == 0, 'route_jumps'] = 1
            print("Getting profit/jump")
            station_prices_diff['profit_per_jump'] = station_prices_diff[
                'profit'] / station_prices_diff['route_jumps']
            station_prices_diff.sort_values(by='profit_per_jump',
                                            ascending=False,
                                            inplace=True)
            station_prices_diff['commodity_id'] = commodity_id
            if result is None:
                result = station_prices_diff
            else:
                result = result.append(station_prices_diff)
    result.sort_values(by='profit_per_jump', ascending=False, inplace=True)
    commodities_df = pandas.DataFrame.from_dict(commodities)
    result = result.merge(commodities_df,
                          how='left',
                          left_on='commodity_id',
                          right_on='id',
                          suffixes=('_result', '_commodity'))
    pandas.set_option('display.max_columns', None)
    print(result[[
        'commodity_name', 'station_name_buy', 'name_buy', 'station_name_sell',
        'name_sell', 'profit_per_jump', 'buy_price_buy', 'sell_price_sell',
        'supply_buy', 'demand_sell'
    ]].head(10))
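# `get_distance` is not shown in this snippet. Given that it is applied to the x/y/z
# coordinates of the buy and sell systems, a plausible sketch is a straight 3D Euclidean
# distance (an assumption, not necessarily the original implementation):
import math

def get_distance(x1, y1, z1, x2, y2, z2):
    # straight-line distance between two star systems in coordinate units (light years)
    return math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2 + (z2 - z1) ** 2)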
cols = [
    c for c in train.columns if c not in [
        'is_churn', 'msno', 'gender', 'bd', 'transaction_date',
        'membership_expire_date'
    ]
]
from sklearn.model_selection import GridSearchCV

param_test1 = {'max_depth': range(10, 100, 10)}
gsearch1 = GridSearchCV(estimator=xgb.XGBClassifier(learning_rate=0.02,
                                                    n_estimators=140,
                                                    max_depth=5,
                                                    min_child_weight=1,
                                                    gamma=0,
                                                    subsample=0.8,
                                                    colsample_bytree=0.8,
                                                    objective='binary:logistic',
                                                    nthread=20,
                                                    scale_pos_weight=1,
                                                    seed=27),
                        param_grid=param_test1,
                        scoring='neg_log_loss',
                        n_jobs=20,
                        iid=False,
                        cv=5)
gsearch1.fit(train[cols], train['is_churn'])
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_
# best_params_ holds scalar values, so orient='index' gives one row per tuned parameter
sortie = pd.DataFrame.from_dict(gsearch1.best_params_, orient='index')
sortie.to_csv('bestParam.csv')
C.append(10e6 * i)
C.append(10e7 * i)
gamma.append(10e-1 * i)
gamma.append(10e-2 * i)
gamma.append(10e-3 * i)
gamma.append(10e-4 * i)
gamma.append(10e1 * i)
gamma.append(10e2 * i)
gamma.append(10e3 * i)
gamma.append(10e4 * i)
gamma.append(10e5 * i)
gamma.append(10e6 * i)

parameters = {'C': C, 'gamma': gamma}
svc = SVC(kernel='rbf')
clf = GridSearchCV(svc, parameters, cv=5, verbose=1, scoring=make_scorer(accuracy_score))
clf.fit(X_train, y_train)
results = clf.cv_results_
df = pd.DataFrame.from_dict(results)
df.to_csv("GridSearchResults.csv")
print(clf.get_params())
y_pred = clf.predict(X_test)
print(np.sum(y_pred == y_test) / y_pred.shape[0])
def graph_reduction(f_from, f_to):
    # read all nodes and edges
    f = open(f_from, 'r')
    spl = f.readlines()
    edge_weight = []
    u = []
    v = []
    # only take corr < 0.7
    for i in range(len(spl)):
        (src, dst, correl) = (spl[i].rstrip()).split('\t')
        if float(correl) < limit:
            band = int(float(correl) / step + 1)
            edge_weight.append((src, dst, str(band)))
            if src not in u:
                u.append(src)
            if dst not in v:
                v.append(dst)
    wg = nx.Graph()
    wg.add_weighted_edges_from(edge_weight)
    edges = wg.edges(data='weight')
    nodes_list = pd.DataFrame({'id': [], 'group': []})
    for source, target, weight in edges:
        if source not in nodes_list['id'].values:
            nodes_list = nodes_list.append({'id': source, 'group': weight}, ignore_index=True)
        if target not in nodes_list['id'].values:
            nodes_list = nodes_list.append({'id': target, 'group': weight}, ignore_index=True)
    # create cluster.json
    nodes_str = ""
    count = 0
    for i, r in nodes_list.iterrows():
        nodes_str = nodes_str + "{\"id\":\"" + r['id'] + "\", \"score\":" + r['group'] + "},"
        count += 1
    nodes_str = nodes_str.rstrip(',')
    links_str = ""
    for i, j, k in edges:
        source_index = np.where(nodes_list['id'] == i)
        target_index = np.where(nodes_list['id'] == j)
        links_str = links_str + "{\"source\":" + str(source_index[0][0]) + \
            ", \"target\":" + str(target_index[0][0]) + "},"
    links_str = links_str.rstrip(',')
    json_str = "{\"nodes\":[" + nodes_str + "], \"links\":[" + links_str + "]}"
    parsed = json.loads(json_str)
    json.dump(parsed, open(f_to, 'w'), indent=4)
    dg = nx.degree_centrality(wg)
    # one row per node, with its degree centrality in the 'degree' column
    degree_centrality = pd.DataFrame.from_dict(dg, orient='index', columns=['degree'])
    # http://bl.ocks.org/AMDS/4a61497182b8fcb05906
    f_ticker_profile = '../data/tickers_prof.tsv'
    tickers_prof = pd.read_csv(f_ticker_profile, sep='\t', index_col=0)
    # attach each ticker's degree centrality to its profile row
    for key in degree_centrality.index:
        tickers_prof.loc[tickers_prof['ticker'] == key, 'degree'] = degree_centrality.loc[key, 'degree']
def statistical_test(self):
    df = pd.DataFrame.from_dict(self.vec_negative)
    print(df.head())
def next(intercept, mean, std):
    new_mean = mean + np.random.normal(0, 0.1, 1)[0]
    new_intercept = intercept + np.random.normal(0, 0.1, 1)[0]
    new_std = max(0, std + np.random.normal(0, 0.1, 1)[0])
    # print('new:')
    # print(new_mean, new_intercept, new_std)
    log_a = probability(new_mean, new_intercept, new_std) - probability(mean, intercept, std)
    if log_a >= 0:
        mean, intercept, std = new_mean, new_intercept, new_std
    else:
        u = np.random.uniform(0, 1, 1)
        if math.log(u) <= log_a:
            mean, intercept, std = new_mean, new_intercept, new_std
    print(mean, intercept, std)
    return round(mean, 1), round(intercept, 1), round(std, 1)


mean, intercept, std = 0, 0, 1
dist_mean = defaultdict(int)
dist_std = defaultdict(int)
for i in range(200):
    mean, intercept, std = next(mean, intercept, std)
    dist_mean[round(mean, 1)] += 1
print(dist_mean)
# dist_mean holds scalar counts, so orient='index' gives one row per sampled mean value
out_df = pandas.DataFrame.from_dict(dist_mean, orient='index')
# .....
# Extract the code from your webapp response
# code = request.get('code')  # or whatever your framework does
# access_token = client.exchange_code_for_token(client_id=22120,
#                                               client_secret='<client_secret>', code=code)
client.access_token = access_token

if os.path.isfile('raw_strava_data.csv'):
    strava_data = pd.read_csv('raw_strava_data.csv')
    strava_data['start_date_local'] = pd.to_datetime(strava_data['start_date_local'])
    activities = client.get_activities(strava_data['start_date_local'].max())
    for activity in activities:
        # append each new activity as a one-row frame
        strava_data = strava_data.append(pd.DataFrame([activity.to_dict()]), ignore_index=True)
    strava_data.to_csv('raw_strava_data.csv')
else:
    activities = client.get_activities()
    types = ['time', 'latlng', 'altitude', 'heartrate', 'temp']
    headers_written = False
    # stream_types = ['time', 'latlng', 'altitude', 'heartrate', 'temp']
    stream_types = ['heartrate']
    with open('raw_strava_data.csv', 'w') as f:
        for activity in activities:
            streams = client.get_activity_streams(activity.id,
                                                  types=stream_types,
                                                  resolution='medium')
            temp = activity.to_dict()
            for k in types:
                if k in streams:
]
for idx, val in enumerate(curr_vals):
    if (val in component_defs.keys() and len(component_defs[val]) == 1):
        curr_vals[idx] = component_defs[val][0]
component_defs[curr_key] = curr_vals

for key, val in component_defs.items():
    # print(key, val)
    val_str = ''.join(component_defs[key])
    component_defs[key] = val_str
    print('{} --> {}'.format(key, val_str))

# Read the dictionary of macro defs into a Pandas DataFrame & write it to csv
col_names = ['Macro', 'Definition']
# one row per macro; rename_axis/reset_index turns the dict keys into the 'Macro' column
df = pd.DataFrame.from_dict(component_defs, orient='index', dtype=str, columns=[col_names[1]])
df = df.rename_axis(col_names[0]).reset_index()
out_path = r"C:\Users\nich980\Documents\Docs\Hector\coding-notes"
f_name_out = "component_names.csv"
out_abs = join(out_path, f_name_out)
df.to_csv(out_abs, sep=',', header=True, index=False)