    tb2.columns = ['回归系数', '标准误差SE', 'Z值', 'p值', '95%CI(下限)', '95%CI(上限)']
    tb2 = tb2.rename(index={'Intercept': '常数'})
    tb3 = pd.Series({
        '自由度': res.df_model,
        'F-值': res.fvalue,
        'p-值': res.f_pvalue
    }).to_frame(name='回归总体ANOVA')
    s4 = pd.Series(res.predict(), name=str(y.name) + '-拟合')
    if curve_type in ['指数曲线', '复合曲线', '增长曲线', 'S型曲线']:
        s4 = np.exp(s4)
    tb4 = y.to_frame().join(s4)
    return {'模型汇总': tb1, '回归系数汇总表': tb2, 'ANOVA表格': tb3, '实际值与拟合值': tb4}


if __name__ == '__main__':
    def func(x, a, b, c):
        return a * np.exp(-b * x) + c

    x = np.linspace(1, 5, 50)
    y = func(x, 2.5, 1.3, 0.5)
    x = pd.Series(x, name='时间')
    y = pd.Series(y, name='药物浓度')
    rs = core(x, y, curve_type='对数曲线')
    rs.get('实际值与拟合值').plot()
        return i['name']
    return np.nan


smd['director'] = smd['crew'].apply(get_director)
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smd['cast'] = smd['cast'].apply(lambda x: x[:3] if len(x) >= 3 else x)
smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

# Strip spaces and lowercase so e.g. "Tom Hanks" and "tom hanks" map to the same token.
smd['cast'] = smd['cast'].apply(
    lambda x: [str.lower(i.replace(" ", "")) for i in x])
smd['director'] = smd['director'].astype('str').apply(
    lambda x: str.lower(x.replace(" ", "")))
# Repeat the director three times so the director carries more weight than a single cast member.
smd['director'] = smd['director'].apply(lambda x: [x, x, x])

# Count keyword occurrences across the corpus and keep only keywords that appear more than once.
s = smd.apply(lambda x: pd.Series(x['keywords']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'
s = s.value_counts()
s = s[s > 1]

stemmer = SnowballStemmer('english')
print(stemmer.stem('dogs'))


def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words
            temp2.write('\n')
            continue
        elif len(final) == 10:
            # dta_hamilton = pd.Series(final, index=pd.date_range('1951-04-01', '1953-07-01', freq='QS'))
            # mod_hamilton = sm.tsa.MarkovAutoregression(dta_hamilton, k_regimes=2, order=4, switching_ar=False)
            # res_hamilton = mod_hamilton.fit()
            # print("FILTERED_MARGINAL_PROBABILITY\n")
            # print(res_hamilton.filtered_marginal_probabilities[0])
            temp2.write(str(compid[i]) + '\t')
            for k in range(0, comp_yr_match[i]):
                temp2.write(str(compyr[l]) + '\t')
                l = l + 1
            temp2.write('\n')
            continue
        elif len(final) == 11:
            dta_hamilton = pd.Series(final, index=pd.date_range('1951-04-01', '1953-10-01', freq='QS'))
            mod_hamilton = sm.tsa.MarkovAutoregression(dta_hamilton, k_regimes=2, order=4, switching_ar=False)
            res_hamilton = mod_hamilton.fit()
            print("FILTERED_MARGINAL_PROBABILITY\n")
            temp2.write(str(compid[i]) + '\t')
            for k in range(0, comp_yr_match[i]):
                temp2.write(str(compyr[l]) + '\t')
                l = l + 1
            temp2.write('\n')
            temp2.write('\t')
            for j in range(0, len(res_hamilton.filtered_marginal_probabilities[0])):
                temp2.write(str(res_hamilton.filtered_marginal_probabilities[0][j]) + '\t')
            temp2.write('\n')
        elif len(final) == 12:
            dta_hamilton = pd.Series(final, index=pd.date_range('1951-04-01', '1954-01-01', freq='QS'))
            mod_hamilton = sm.tsa.MarkovAutoregression(dta_hamilton, k_regimes=2, order=4, switching_ar=False)
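# A hedged refactoring sketch, not part of the original script: the per-length branches
# above differ only in the end date of the quarterly index, so the index could be built
# from len(final) directly, assuming every series starts at 1951-04-01. The helper name
# is hypothetical.
def fit_hamilton_sketch(final):
    index = pd.date_range('1951-04-01', periods=len(final), freq='QS')
    dta = pd.Series(final, index=index)
    mod = sm.tsa.MarkovAutoregression(dta, k_regimes=2, order=4, switching_ar=False)
    return mod.fit()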
def get_rolling_mean(df, window=20):
    return pd.Series(df).rolling(window).mean()
def get_rolling_std(df, window=20):
    return pd.Series(df).rolling(window).std()
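# A minimal usage sketch with made-up price data (not from the original code):
# the two helpers above can be combined into Bollinger-style upper/lower bands.
import numpy as np
import pandas as pd

prices = pd.Series(np.random.default_rng(0).normal(100, 2, size=250), name='close')
rolling_mean = get_rolling_mean(prices, window=20)
rolling_std = get_rolling_std(prices, window=20)
upper_band = rolling_mean + 2 * rolling_std
lower_band = rolling_mean - 2 * rolling_std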
# In[ ]:

# Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)
acc_log


# In[ ]:

coeff_df = pd.DataFrame(train_df.columns.delete(0))
coeff_df.columns = ['Feature']
coeff_df["Correlation"] = pd.Series(logreg.coef_[0])
coeff_df.sort_values(by='Correlation', ascending=False)


# ## Support Vector Machines

# In[ ]:

svc = SVC()
svc.fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
acc_svc = round(svc.score(X_train, Y_train) * 100, 2)
acc_svc


# ## k-Nearest Neighbors
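# In[ ]:

# A hedged sketch of the k-NN cell that the heading above introduces; the classifier and
# n_neighbors=3 are assumptions that mirror the pattern of the preceding models, and it
# requires `from sklearn.neighbors import KNeighborsClassifier` to be imported above.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
acc_knn = round(knn.score(X_train, Y_train) * 100, 2)
acc_knn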
def map_wafertt_majority(data):
    # Most frequent tt_bin in this group, broadcast to every row of the group.
    tt_bin = data['tt_bin'].value_counts().index[0]
    data['wafer_tt_bin'] = pd.Series([tt_bin for x in data.index], index=data.index)
    return data
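# A hedged usage sketch with made-up data (the 'wafer_id' column is an assumption):
# map_wafertt_majority is written to be applied per wafer group, stamping the most
# frequent tt_bin onto every row of that group.
import pandas as pd

demo = pd.DataFrame({'wafer_id': [1, 1, 1, 2, 2],
                     'tt_bin':   ['A', 'A', 'B', 'C', 'C']})
demo = demo.groupby('wafer_id', group_keys=False).apply(map_wafertt_majority)
print(demo)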
def generate_weights(symbol_probs, literals_to_symbols, extras=0):
    ixs = pd.Series(literals_to_symbols)
    probs = symbol_probs.loc[ixs, 'prob'].values
    nprobs = symbol_probs.loc[ixs, 'nprob'].values
    probs = np.r_[nprobs[::-1], [0] * extras, [1] * extras, probs]
    return probs
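# A hedged usage sketch with toy inputs (the real tables come from elsewhere in the
# project): symbol_probs is assumed to be indexed by symbol with 'prob'/'nprob'
# columns, and literals_to_symbols to map literal positions to symbol labels.
import numpy as np
import pandas as pd

symbol_probs = pd.DataFrame({'prob': [0.2, 0.7], 'nprob': [0.8, 0.3]},
                            index=['a', 'b'])
weights = generate_weights(symbol_probs, ['a', 'b', 'a'], extras=1)
# weights = nprob values in reverse literal order, `extras` zeros and ones, then prob values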
plt.figure()
sns.distplot(error_df['percent_difference'], bins=30, kde=False)
plt.show()
plt.close()

# %%
# now I want to find the root mean squared error for each compound versus
# other calibration curves
# rmse_dict is going to store each compound's rmse versus all the other compounds
# key = compound, value = DataFrame of rmse for other compounds
from math import sqrt

rmse_dict = {compound: pd.DataFrame() for compound in reg_df.index}

for target_compound in reg_df.index:
    # x axis (calibration levels), on a log10 scale
    new_x = pd.Series(
        [1, 2, 5, 10, 25, 50, 100, 250, 500, 1000, 2500, 5000, 10000])
    new_x = np.log10(new_x)
    # print(new_x)

    # list of all the compounds
    reg_columns = list(reg_df.index)
    length = len(reg_columns) - 1

    # mask to find the compound of interest
    mask = new_data_melt['compound'] == target_compound
    reg_mask = reg_df.index == target_compound

    # temporary dataframe to store the rmse values for all the other compounds
    temp = pd.DataFrame()

    # calculates the root mean squared value for the targeted compound
def run_experiment():
    """Runs the training experiment."""
    try:
        tf.io.gfile.makedirs(
            os.path.join(FLAGS.root_output_dir, FLAGS.exp_name))
    except tf.errors.OpError:
        pass

    train_set, validation_set, test_set = (
        dataset.construct_word_level_datasets(
            vocab_size=FLAGS.vocab_size,
            batch_size=FLAGS.batch_size,
            client_epochs_per_round=1,
            max_seq_len=FLAGS.sequence_length,
            max_training_elements_per_user=-1,
            shuffle_buffer_size=None,
            num_validation_examples=FLAGS.num_validation_examples,
            num_test_examples=FLAGS.num_test_examples))
    train_set = (train_set.create_tf_dataset_from_all_clients().shuffle(
        FLAGS.shuffle_buffer_size))

    recurrent_model = tf.keras.layers.LSTM if FLAGS.lstm else tf.keras.layers.GRU

    def _layer_fn():
        return recurrent_model(FLAGS.latent_size, return_sequences=True)

    model = models.create_recurrent_model(
        FLAGS.vocab_size, FLAGS.embedding_size, FLAGS.num_layers, _layer_fn,
        'stackoverflow-recurrent', shared_embedding=FLAGS.shared_embedding)
    logging.info('Training model: %s', model.summary())
    optimizer = utils_impl.create_optimizer_from_flags('centralized')
    model.compile(loss=tf.keras.losses.sparse_categorical_crossentropy,
                  optimizer=optimizer,
                  weighted_metrics=['acc'])

    train_results_path = os.path.join(FLAGS.root_output_dir, FLAGS.exp_name,
                                      'train_results')
    test_results_path = os.path.join(FLAGS.root_output_dir, FLAGS.exp_name,
                                     'test_results')

    train_csv_logger = AtomicCSVLogger(train_results_path)
    test_csv_logger = AtomicCSVLogger(test_results_path)

    log_dir = os.path.join(FLAGS.root_output_dir, 'logdir', FLAGS.exp_name)
    try:
        tf.io.gfile.makedirs(log_dir)
        tf.io.gfile.makedirs(train_results_path)
        tf.io.gfile.makedirs(test_results_path)
    except tf.errors.OpError:
        pass  # log_dir already exists.

    train_tensorboard_callback = tf.keras.callbacks.TensorBoard(
        log_dir=log_dir,
        write_graph=True,
        update_freq=FLAGS.tensorboard_update_frequency)
    test_tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir)

    results_file = os.path.join(FLAGS.root_output_dir, FLAGS.exp_name,
                                'results.csv.bz2')

    # Write the hyperparameters to a CSV:
    hparam_dict = collections.OrderedDict([(name, FLAGS[name].value)
                                           for name in hparam_flags])
    hparam_dict['results_file'] = results_file
    hparams_file = os.path.join(FLAGS.root_output_dir, FLAGS.exp_name,
                                'hparams.csv')
    utils_impl.atomic_write_to_csv(pd.Series(hparam_dict), hparams_file)

    class_weight = dataset.get_class_weight(FLAGS.vocab_size)
    model.fit(train_set,
              epochs=FLAGS.epochs,
              verbose=1,
              class_weight=class_weight,
              validation_data=validation_set,
              callbacks=[train_csv_logger, train_tensorboard_callback])
    score = model.evaluate(
        test_set,
        verbose=1,
        callbacks=[test_csv_logger, test_tensorboard_callback])
    logging.info('Final test loss: %.4f', score[0])
    logging.info('Final test accuracy: %.4f', score[1])
import time

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

X, y = make_classification(
    n_samples=10_000,  # placeholder value; the sample count was left undefined, adjust as needed
    n_features=20,
    n_informative=20,
    n_redundant=0,
    n_classes=5,
    random_state=42
)

start = time.time()
rf = RandomForestClassifier(
    n_estimators=500,
    random_state=42,
    n_jobs=1
)
print("Fitting model...")
rf.fit(X, y)
end = time.time()

print("Number of seconds this model took to fit:")
print(end - start)

y_pred = pd.Series(rf.predict(X), name="predictions")
y_pred.to_csv("output.csv", header=True)
def on_epoch_end(self, epoch, logs=None):
    epoch_path = os.path.join(self._path, 'results.{:02d}.csv'.format(epoch))
    utils_impl.atomic_write_to_csv(pd.Series(logs), epoch_path)
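# utils_impl.atomic_write_to_csv is project-specific and its implementation is not shown
# here; purely as a hedged illustration of the atomic-write idea (write to a temporary
# file, then rename over the target so a reader never sees a half-written CSV), one
# possible stand-in could look like the sketch below.
import os
import tempfile

import pandas as pd


def atomic_write_to_csv_sketch(series: pd.Series, path: str) -> None:
    dirname = os.path.dirname(path) or '.'
    fd, tmp_path = tempfile.mkstemp(dir=dirname, suffix='.tmp')
    os.close(fd)
    series.to_csv(tmp_path, header=True)
    os.replace(tmp_path, path)  # atomic when source and target share a filesystem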
try:
    page_filter = requests.get(site_filter)
except requests.exceptions.RequestException:
    raise ConnectionError("Check your Internet Connection")

soup = BeautifulSoup(page_filter.content, "html.parser")
body_filter = soup.find("body")
movie_col = body_filter.find(class_="mv-row")
all_movies = movie_col.find_all('div', {"data-selector": True})
events = movie_col.find_all('ul', {"class": "rating-stars"})

## Mask for only movies in now_showing
mask = [movie["data-language-filter"] != "" for movie in all_movies]
movies = [all_movies[i]["data-search-filter"]
          for i in range(len(all_movies)) if mask[i]]

## Movie names used for creating link to the movies
all_movies_name = list(map(movie_name, movies))
#print(all_movies_name)

## Actual movie names
movies_names = pd.Series([events[i]["event-name"]
                          for i in range(len(events)) if mask[i]])
event_codes = [events[i]["event-code"] for i in range(len(events)) if mask[i]]
print(movies_names)
##print(event_codes)
def test_astype_str(self, data):
    result = pd.Series(data[:5]).astype(str)
    expected_dtype = SparseDtype(str, str(data.fill_value))
    expected = pd.Series([str(x) for x in data[:5]], dtype=expected_dtype)
    self.assert_series_equal(result, expected)
def core(x, y, curve_type='二次曲线'):
    '''
    x: pd.Series
    y: pd.Series
    curve_types = ['二次曲线', '三次曲线', '对数曲线', '指数曲线', '复合曲线', '增长曲线', 'S型曲线']
    '''
    df = x.to_frame().join(y)
    df = sm.add_constant(df, prepend=True)
    df = df.rename(columns={'const': '常数项'})
    if curve_type == '二次曲线':
        # I() is needed so patsy treats ** as an arithmetic power rather than term expansion
        formula = '%s ~ %s + %s + I(%s**2)' % (y.name, '常数项', x.name, x.name)
    if curve_type == '三次曲线':
        formula = '%s ~ %s + %s + I(%s**2) + I(%s**3)' % (y.name, '常数项', x.name, x.name, x.name)
    if curve_type == '对数曲线':
        formula = '%s ~ %s + np.log(%s)' % (y.name, '常数项', x.name)
    if curve_type == '指数曲线':
        formula = 'np.log(%s) ~ np.log(%s) + %s' % (y.name, '常数项', x.name)
    if curve_type == '复合曲线':
        formula = 'np.log(%s) ~ np.log(%s) + np.log(%s)*%s' % (y.name, '常数项', '常数项', x.name)
    if curve_type == '增长曲线':
        formula = 'np.log(%s) ~ %s + %s' % (y.name, '常数项', x.name)
    if curve_type == 'S型曲线':
        def _r(x):
            return 1 / x
        formula = 'np.log(%s) ~ %s + _r(%s)' % (y.name, '常数项', x.name)
    res = ols(formula, df).fit()
    tables = res.summary().tables
    df_list = [pd.read_html(StringIO(t.as_html()))[0] for t in tables]
    td = {
        'R平方': res.rsquared,
        '调整R方': res.rsquared_adj,
        '标准误': res.mse_model,
        'AIC': res.aic,
        'BIC': res.bic,
        '有效样本': res.nobs
    }
    tb1 = pd.Series(td).to_frame(name='模型汇总').T
    tb2 = df_list[1].set_index(0).iloc[1:].loc[['Intercept', x.name]]
    tb2.columns = ['回归系数', '标准误差SE', 'Z值', 'p值', '95%CI(下限)', '95%CI(上限)']
    tb2 = tb2.rename(index={'Intercept': '常数'})
    tb3 = pd.Series({
        '自由度': res.df_model,
        'F-值': res.fvalue,
        'p-值': res.f_pvalue
    }).to_frame(name='回归总体ANOVA')
    s4 = pd.Series(res.predict(), name=str(y.name) + '-拟合')
    if curve_type in ['指数曲线', '复合曲线', '增长曲线', 'S型曲线']:
        s4 = np.exp(s4)
    tb4 = y.to_frame().join(s4)
    return {'模型汇总': tb1, '回归系数汇总表': tb2, 'ANOVA表格': tb3, '实际值与拟合值': tb4}
__package__ = None

_logger = logging.getLogger(__name__)

index = pa.Index([datetime(1991, 12, 31), datetime(1992, 1, 1), datetime(1992, 1, 2),
                  datetime(1992, 1, 3), datetime(1992, 1, 6), datetime(1992, 1, 7),
                  datetime(1992, 1, 8), datetime(1992, 1, 9), datetime(1992, 1, 10),
                  datetime(1992, 1, 13)])

data = pa.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], index=index, dtype=float)
datanode = mdf.datanode("data", data)

df = pa.DataFrame({"A": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                   "B": [np.nan, 9, 8, np.nan, 6, 5, 4, 3, 2, 1]},
                  index=index, dtype=float)
dfnode = mdf.datanode("df", df)

filter = pa.Series([False, True, False, True, False, True, False, True, False, True],
                   index=index)
filternode = mdf.datanode("filter", filter)

queue_expected = [data[data.index <= d].tolist() for d in index]
queue_filtered_expected = [data[filter][data[filter].index <= d].tolist() for d in index]
delay_expected = data.shift(1)
agent_spec = {
    "type": "a2c",
    "learning_rate": 0.0003,
    "discount": 1.0,
    "estimate_terminal": False,
    "max_episode_timesteps": 20000,  # roughly: the minimum number of timesteps before an update
    "network": network_spec,
    "batch_size": 100,
    "update_frequency": 100
}

environment = TradingEnvironment(exchange=exchange,
                                 action_strategy=action_strategy,
                                 reward_strategy=reward_strategy,
                                 feature_pipeline=feature_pipeline)

strategy = TensorforceTradingStrategy(environment=environment,
                                      agent_spec=agent_spec,
                                      save_best_agent=False)

# %% Start Over
performance = strategy.run(episodes=100, evaluation=False)
r = pd.Series(strategy._runner.episode_rewards)

# manually store agent
# strategy.save_agent(directory='save/', filename='01')

# %% Restore and Continue
'''
strategy.restore_agent(directory='save/', filename='best-model')
performance = strategy.run(episodes=(strategy._runner.agent.episodes + 20),
                           evaluation=False)
'''
with open('poetry.txt', 'r', encoding='UTF-8') as f:
    raw_text = f.read()
lines = raw_text.split("\n")[:-1]
poem_text = [i.split(':')[1] for i in lines]
char_list = [re.findall('[\x80-\xff]{3}|[\w\W]', s) for s in poem_text]


# In[3]:

# Mapping between Chinese characters and integer ids
all_words = []
for i in char_list:
    all_words.extend(i)
word_dataframe = pd.DataFrame(pd.Series(all_words).value_counts())
word_dataframe['id'] = list(range(1, len(word_dataframe) + 1))

word_index_dict = word_dataframe['id'].to_dict()
index_dict = {}
for k in word_index_dict:
    index_dict.update({word_index_dict[k]: k})

len(all_words), len(word_dataframe), len(index_dict)


# In[4]:

# Build the training data: x is the previous two characters, y is the next character.
# For example, "明月几时有" is turned into three such samples, as sketched below.
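# A hedged sketch of that windowing step (the two-character context is the stated
# design; the variable names below are illustrative, not from the original notebook):
seq_len = 2
train_x, train_y = [], []
for poem in char_list:
    ids = [word_index_dict[w] for w in poem]
    for i in range(len(ids) - seq_len):
        train_x.append(ids[i:i + seq_len])   # e.g. 明月 -> 几, 月几 -> 时, 几时 -> 有
        train_y.append(ids[i + seq_len])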
                                 args=[df_PDO.loc[0]['TrainIsTrue']],
                                 result_type='broadcast')

# Butterworth low-pass filter
dates = df_PDOsplit.index
freqraw = (dates[1] - dates[0]).days
ls = ['solid', 'dotted', 'dashdot', 'dashed']
fig, ax = plt.subplots(1, 1, figsize=(10, 5))
list_dfPDO = [df_PDOsplit]
lowpass_yrs = [.25, .5, 1.0, 2.0]
for i, yr in enumerate(lowpass_yrs):
    window = int(yr * functions_pp.get_oneyr(dates).size)  # number of time steps in `yr` years
    if i == 0:
        ax.plot_date(dates, df_PDOsplit.values, label=f'Raw ({freqraw} day means)',
                     alpha=.3, linestyle='solid', marker=None)
    df_PDObw = pd.Series(filters.lowpass(df_PDOsplit, period=window).squeeze(),
                         index=dates, name=f'PDO{yr}bw')
    ax.plot_date(dates, df_PDObw, label=f'Butterworth {yr}-year low-pass',
                 color='red', linestyle=ls[i], linewidth=1, marker=None)
    df_PDOrm = df_PDOsplit.rolling(window=window, closed='right',
                                   min_periods=window).mean()
    df_PDOrm = df_PDOrm.rename({'PDO': f'PDO{yr}rm'}, axis=1)
    ax.plot_date(dates, df_PDOrm, label=f'Rolling mean {yr}-year low-pass (closed right)',
                 color='green', linestyle=ls[i], linewidth=1, marker=None)
    list_dfPDO.append(df_PDObw)
    list_dfPDO.append(df_PDOrm)
ax.legend()
filepath = os.path.join(path_out_main, 'Low-pass_filter.pdf')
plt.savefig(filepath, bbox_inches='tight')
df_PDOs = pd.concat(list_dfPDO, axis=1)
functions_pp.store_hdf_df({'df_data': df_PDOs},
## Historical data
## Individual lowest transaction price
dta = [10000, 11600, 14000, 18700, 23900, 10000, 10700, 11500, 11600, 12300,
       13100, 14500, 16000, 14800, 10300, 12000, 13200, 14500, 16800, 20200,
       26000, 35000, 10000, 13500, 16000, 19000, 24600, 21000, 19500, 21100,
       23800, 25000, 23400, 10100, 15100, 18100, 19500, 18600, 15300, 15100,
       15900, 17300, 18900, 21500, 23900, 25000, 26800, 28800, 30000, 30700,
       18000, 21000, 22800, 25300, 32100]

## Individual average transaction price
# dta = [22822, 14138, 11067, 15743, 20631, 26599, 30802, 16384, 12777,
#        14074, 15391, 17024, 18310, 16654, 12382, 13137, 14331, 15436, 17798, 21506, 27672,
#        37805, 36231, 16886, 17487, 20609, 27077, 25727, 21884, 23315, 25701, 27127, 28541, 24324, 17330, 19614,
#        21551, 22300, 20591, 17508, 17419, 18358, 20127, 22996, 25498, 26668, 28561, 30535, 32449, 34046, 32312,
#        25213, 24560, 26939, 34455]

dta = np.array(dta, dtype=float)
dta = pd.Series(dta)
dta.index = pd.Index(sm.tsa.datetools.dates_from_range('2001', '2055'))
dta.plot(figsize=(12, 8))

## Differencing
fig = plt.figure(figsize=(12, 8))
ax1 = fig.add_subplot(111)
diff1 = dta.diff(1)
diff1.plot(ax=ax1)

fig = plt.figure(figsize=(12, 8))
ax2 = fig.add_subplot(111)
diff2 = dta.diff(2)
diff2.plot(ax=ax2)

##
diff1 = dta.diff(1)
fig = plt.figure(figsize=(12, 8))
ax1 = fig.add_subplot(211)
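# A hedged continuation sketch (the standard Box-Jenkins next step, not recovered from
# the original script): plot the ACF/PACF of the first difference to pick ARMA orders.
fig = sm.graphics.tsa.plot_acf(diff1.dropna(), lags=20, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(diff1.dropna(), lags=20, ax=ax2)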