    tb2.columns = ['回归系数', '标准误差SE', 'Z值', 'p值', '95%CI(下限)', '95%CI(上限)']
    tb2 = tb2.rename(index={'Intercept': '常数'})

    tb3 = pd.Series({
        '自由度': res.df_model,
        'F-值': res.fvalue,
        'p-值': res.f_pvalue
    }).to_frame(name='回归总体ANOVA')

    s4 = pd.Series(res.predict(), name=str(y.name) + '-拟合')
    if curve_type in ['指数曲线', '复合曲线', '增长曲线', 'S型曲线']:
        s4 = np.exp(s4)

    tb4 = y.to_frame().join(s4)

    return {'模型汇总': tb1, '回归系数汇总表': tb2, 'ANOVA表格': tb3, '实际值与拟合值': tb4}


if __name__ == '__main__':

    def func(x, a, b, c):
        return a * np.exp(-b * x) + c

    x = np.linspace(1, 5, 50)
    y = func(x, 2.5, 1.3, 0.5)

    x = pd.Series(x, name='时间')
    y = pd.Series(y, name='药物浓度')

    rs = core(x, y, curve_type='对数曲线')
    rs.get('实际值与拟合值').plot()
Example #2

def get_director(x):
    # the opening lines of this function were cut off in the excerpt; the loop
    # below is reconstructed from the surrounding usage (scan the crew list
    # for the directing credit)
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan


smd['director'] = smd['crew'].apply(get_director)
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x]
                                if isinstance(x, list) else [])
smd['cast'] = smd['cast'].apply(lambda x: x[:3] if len(x) >= 3 else x)
smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x]
                                        if isinstance(x, list) else [])
smd['cast'] = smd['cast'].apply(
    lambda x: [str.lower(i.replace(" ", "")) for i in x])
smd['director'] = smd['director'].astype('str').apply(
    lambda x: str.lower(x.replace(" ", "")))
smd['director'] = smd['director'].apply(lambda x: [x, x, x])
s = smd.apply(lambda x: pd.Series(x['keywords']),
              axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'
s = s.value_counts()
# print(s[:5])
s = s[s > 1]
stemmer = SnowballStemmer('english')
print(stemmer.stem('dogs'))
print(111111111111111)


def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words
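
# A hedged usage sketch (not from the original snippet): in this style of
# recommender pipeline the filtered keywords are typically stemmed and
# normalised before being combined with cast and director. Only names already
# defined above are used here.
smd['keywords'] = smd['keywords'].apply(filter_keywords)
smd['keywords'] = smd['keywords'].apply(
    lambda x: [stemmer.stem(i) for i in x])
smd['keywords'] = smd['keywords'].apply(
    lambda x: [str.lower(i.replace(" ", "")) for i in x])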
Example #3
     temp2.write('\n')
     continue
 elif len(final) == 10:
     #dta_hamilton = pd.Series(final, index=pd.date_range('1951-04-01', '1953-07-01', freq='QS'))
     #mod_hamilton = sm.tsa.MarkovAutoregression(dta_hamilton, k_regimes=2, order=4, switching_ar=False)
     #res_hamilton = mod_hamilton.fit()
     #print("FILTERED_MARGINAL_PROBABILITY\n")
     #print(res_hamilton.filtered_marginal_probabilities[0])
     temp2.write(str(compid[i]) + '\t')
     for k in range(0, comp_yr_match[i]):
         temp2.write(str(compyr[l]) + '\t')
         l = l + 1
     print('\n')
     continue
 elif len(final) == 11:
     dta_hamilton = pd.Series(final, index=pd.date_range('1951-04-01', '1953-10-01', freq='QS'))
     mod_hamilton = sm.tsa.MarkovAutoregression(dta_hamilton, k_regimes=2, order=4, switching_ar=False)
     res_hamilton = mod_hamilton.fit()
     print("FILTERED_MARGINAL_PROBABILITY\n")
     temp2.write(str(compid[i]) + '\t')
     for k in range(0, comp_yr_match[i]):
         temp2.write(str(compyr[l]) + '\t')
         l = l + 1
     temp2.write('\n')
     temp2.write('\t')
     for j in range(0, len(res_hamilton.filtered_marginal_probabilities[0])):
         temp2.write(str(res_hamilton.filtered_marginal_probabilities[0][j]) + '\t')
     temp2.write('\n')
 elif len(final) == 12:
     dta_hamilton = pd.Series(final, index=pd.date_range('1951-04-01', '1954-01-01', freq='QS'))
     mod_hamilton = sm.tsa.MarkovAutoregression(dta_hamilton, k_regimes=2, order=4, switching_ar=False)
def get_rolling_mean(df, window=20):
    return pd.Series(df).rolling(window).mean()


def get_rolling_std(df, window=20):
    return pd.Series(df).rolling(window).std()
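
# A minimal usage sketch for the two helpers above, with a made-up price
# series (the variable names below are illustrative, not from the source).
prices = pd.Series([10.0, 10.5, 10.2, 10.8, 11.0, 10.9, 11.3, 11.1],
                   name='close')
rolling_mean = get_rolling_mean(prices, window=3)
rolling_std = get_rolling_std(prices, window=3)
upper_band = rolling_mean + 2 * rolling_std   # Bollinger-style bands
lower_band = rolling_mean - 2 * rolling_std
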
# In[ ]:

# Logistic Regression

logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)
acc_log

# In[ ]:

coeff_df = pd.DataFrame(train_df.columns.delete(0))
coeff_df.columns = ['Feature']
coeff_df["Correlation"] = pd.Series(logreg.coef_[0])

coeff_df.sort_values(by='Correlation', ascending=False)

# ## Support Vector Machines

# In[ ]:

svc = SVC()
svc.fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
acc_svc = round(svc.score(X_train, Y_train) * 100, 2)
acc_svc

# ## k-Nearest Neighbors
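
# In[ ]:

# The k-Nearest Neighbors code is cut off in this excerpt. The sketch below
# follows the same pattern as the Logistic Regression and SVC blocks above;
# the KNeighborsClassifier import and the n_neighbors value are assumptions,
# not part of the original notebook.
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
acc_knn = round(knn.score(X_train, Y_train) * 100, 2)
acc_knn
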
def map_wafertt_majority(data):
    tt_bin = data['tt_bin'].value_counts().index[0]
    data['wafer_tt_bin'] = pd.Series([tt_bin for x in data.index],
                                     index=data.index)
    return data
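
# Hedged usage sketch for the helper above: apply it per wafer with a pandas
# groupby. The toy frame and the 'wafer_id' column are illustrative
# assumptions, not taken from the original snippet.
toy = pd.DataFrame({'wafer_id': ['w1', 'w1', 'w2', 'w2', 'w2'],
                    'tt_bin': ['A', 'B', 'B', 'B', 'C']})
toy = toy.groupby('wafer_id', group_keys=False).apply(map_wafertt_majority)
# each row now carries its wafer's most frequent tt_bin in 'wafer_tt_bin'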
Example #8
def generate_weights(symbol_probs, literals_to_symbols, extras=0):
    ixs = pd.Series(literals_to_symbols)
    probs = symbol_probs.loc[ixs, 'prob'].values
    nprobs = symbol_probs.loc[ixs, 'nprob'].values
    probs = np.r_[nprobs[::-1], [0] * extras, [1] * extras, probs]
    return probs
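
# Hedged usage sketch: a toy symbol-probability table with the 'prob' and
# 'nprob' columns the function expects; the symbol names and values are
# illustrative, not from the source.
symbol_probs = pd.DataFrame({'prob': [0.9, 0.2, 0.5],
                             'nprob': [0.1, 0.8, 0.5]},
                            index=['a', 'b', 'c'])
weights = generate_weights(symbol_probs, ['a', 'c', 'b'], extras=1)
# -> reversed nprob values, one 0, one 1, then the prob values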
Example #9
plt.figure()
sns.distplot(error_df['percent_difference'], bins=30, kde=False)
plt.show()
plt.close()

# %%
# now I want to find the root mean squared error for each compound versus
# other calibration curves
# rmse_dict is going to store each compound's rmse versus all the other compounds
# key = compound, value = DataFrame of rmse for other compounds
from math import sqrt
rmse_dict = {compound: pd.DataFrame() for compound in reg_df.index}

for target_compound in reg_df.index:
    # x axis
    new_x = pd.Series(
        [1, 2, 5, 10, 25, 50, 100, 250, 500, 1000, 2500, 5000, 10000])
    new_x = np.log10(new_x)
    # print(new_x)

    # list of all the compounds
    reg_columns = list(reg_df.index)
    length = len(reg_columns) - 1

    # mask to find the compound of interest
    mask = new_data_melt['compound'] == target_compound
    reg_mask = reg_df.index == target_compound

    # temporary dataframe to store the rmse values for all the other compounds
    temp = pd.DataFrame()

    # calculates the root mean squared value for the targeted compound
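    # The calculation itself is cut off in this excerpt. A hedged sketch of one
    # way to finish it, assuming reg_df stores a fitted line per compound in
    # 'slope' and 'intercept' columns (those column names are assumptions, not
    # from the source): predict the target compound's responses with its own
    # curve, then score every other compound's curve against them.
    target_pred = (reg_df.loc[reg_mask, 'slope'].values[0] * new_x
                   + reg_df.loc[reg_mask, 'intercept'].values[0])
    for other in reg_columns:
        if other == target_compound:
            continue
        other_pred = (reg_df.loc[other, 'slope'] * new_x
                      + reg_df.loc[other, 'intercept'])
        temp.loc[other, 'rmse'] = sqrt(((target_pred - other_pred) ** 2).mean())
    rmse_dict[target_compound] = temp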
Example #10
def run_experiment():
    """Runs the training experiment."""
    try:
        tf.io.gfile.makedirs(
            os.path.join(FLAGS.root_output_dir, FLAGS.exp_name))
    except tf.errors.OpError:
        pass

    train_set, validation_set, test_set = (
        dataset.construct_word_level_datasets(
            vocab_size=FLAGS.vocab_size,
            batch_size=FLAGS.batch_size,
            client_epochs_per_round=1,
            max_seq_len=FLAGS.sequence_length,
            max_training_elements_per_user=-1,
            shuffle_buffer_size=None,
            num_validation_examples=FLAGS.num_validation_examples,
            num_test_examples=FLAGS.num_test_examples))
    train_set = (train_set.create_tf_dataset_from_all_clients().shuffle(
        FLAGS.shuffle_buffer_size))

    recurrent_model = tf.keras.layers.LSTM if FLAGS.lstm else tf.keras.layers.GRU

    def _layer_fn():
        return recurrent_model(FLAGS.latent_size, return_sequences=True)

    model = models.create_recurrent_model(
        FLAGS.vocab_size,
        FLAGS.embedding_size,
        FLAGS.num_layers,
        _layer_fn,
        'stackoverflow-recurrent',
        shared_embedding=FLAGS.shared_embedding)
    logging.info('Training model: %s', model.summary())
    optimizer = utils_impl.create_optimizer_from_flags('centralized')
    model.compile(loss=tf.keras.losses.sparse_categorical_crossentropy,
                  optimizer=optimizer,
                  weighted_metrics=['acc'])

    train_results_path = os.path.join(FLAGS.root_output_dir, FLAGS.exp_name,
                                      'train_results')
    test_results_path = os.path.join(FLAGS.root_output_dir, FLAGS.exp_name,
                                     'test_results')

    train_csv_logger = AtomicCSVLogger(train_results_path)
    test_csv_logger = AtomicCSVLogger(test_results_path)

    log_dir = os.path.join(FLAGS.root_output_dir, 'logdir', FLAGS.exp_name)
    try:
        tf.io.gfile.makedirs(log_dir)
        tf.io.gfile.makedirs(train_results_path)
        tf.io.gfile.makedirs(test_results_path)
    except tf.errors.OpError:
        pass  # log_dir already exists.

    train_tensorboard_callback = tf.keras.callbacks.TensorBoard(
        log_dir=log_dir,
        write_graph=True,
        update_freq=FLAGS.tensorboard_update_frequency)

    test_tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir)

    results_file = os.path.join(FLAGS.root_output_dir, FLAGS.exp_name,
                                'results.csv.bz2')

    # Write the hyperparameters to a CSV:
    hparam_dict = collections.OrderedDict([(name, FLAGS[name].value)
                                           for name in hparam_flags])
    hparam_dict['results_file'] = results_file
    hparams_file = os.path.join(FLAGS.root_output_dir, FLAGS.exp_name,
                                'hparams.csv')
    utils_impl.atomic_write_to_csv(pd.Series(hparam_dict), hparams_file)

    class_weight = dataset.get_class_weight(FLAGS.vocab_size)

    model.fit(train_set,
              epochs=FLAGS.epochs,
              verbose=1,
              class_weight=class_weight,
              validation_data=validation_set,
              callbacks=[train_csv_logger, train_tensorboard_callback])
    score = model.evaluate(
        test_set,
        verbose=1,
        callbacks=[test_csv_logger, test_tensorboard_callback])
    logging.info('Final test loss: %.4f', score[0])
    logging.info('Final test accuracy: %.4f', score[1])
Example #11
import time

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

X, y = make_classification(
    n_samples=ERROR,  # 'ERROR' is an undefined placeholder left in the source; replace it with a concrete sample count
    n_features=20,
    n_informative=20,
    n_redundant=0,
    n_classes=5,
    random_state=42
)

start = time.time()

rf = RandomForestClassifier(
    n_estimators=500,
    random_state=42,
    n_jobs=1
)

print("Fitting model...")
rf.fit(X, y)

end = time.time()

print("Number of seconds this model took to fit:")
print(end - start)

y_pred = pd.Series(rf.predict(X), name="predictions")
y_pred.to_csv("output.csv", header=True)
Example #12
 def on_epoch_end(self, epoch, logs=None):
     epoch_path = os.path.join(self._path,
                               'results.{:02d}.csv'.format(epoch))
     utils_impl.atomic_write_to_csv(pd.Series(logs), epoch_path)
import pandas as pd
import requests
from bs4 import BeautifulSoup

try:
    page_filter = requests.get(site_filter)
except requests.exceptions.RequestException:
    raise ConnectionError("Check your Internet Connection")

soup = BeautifulSoup(page_filter.content, "html.parser")

body_filter = soup.find("body")

movie_col = body_filter.find(class_ = "mv-row")

all_movies = movie_col.find_all('div', {"data-selector" : True})
    
events = movie_col.find_all('ul', {"class" : "rating-stars"})

## Mask for only movies in now_showing
mask = [movie["data-language-filter"]!="" for movie in all_movies]

movies = [all_movies[i]["data-search-filter"] for i in range(len(all_movies)) if mask[i]]

## Movie names used for creating link to the movies
all_movies_name = list(map(movie_name, movies))
#print(all_movies_name)

## Actual movies names
movies_names = pd.Series([events[i]["event-name"] for i in range(len(events)) if mask[i]])

event_codes = [events[i]["event-code"] for i in range(len(events)) if mask[i]]
print(movies_names)
##print(event_codes)
Example #14
 def test_astype_str(self, data):
     result = pd.Series(data[:5]).astype(str)
     expected_dtype = SparseDtype(str, str(data.fill_value))
     expected = pd.Series([str(x) for x in data[:5]], dtype=expected_dtype)
     self.assert_series_equal(result, expected)

import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
from io import StringIO


def core(x, y, curve_type='二次曲线'):
    '''
    x: pd.Series
    y: pd.Series
    curve_types = ['二次曲线' (quadratic), '三次曲线' (cubic), '对数曲线' (logarithmic),
                   '指数曲线' (exponential), '复合曲线' (compound), '增长曲线' (growth),
                   'S型曲线' (S-shaped)]
    '''

    df = x.to_frame().join(y)
    df = sm.add_constant(df, prepend=True)
    df = df.rename(columns={'const': '常数项'})

    if curve_type == '二次曲线':
        # patsy treats a bare x**2 as just x, so power terms must be wrapped in I()
        formula = '%s ~ %s + %s + I(%s**2)' % (y.name, '常数项', x.name, x.name)

    if curve_type == '三次曲线':
        formula = '%s ~ %s + %s + I(%s**2) + I(%s**3)' % (y.name, '常数项', x.name,
                                                          x.name, x.name)

    if curve_type == '对数曲线':
        formula = '%s ~ %s + np.log(%s)' % (y.name, '常数项', x.name)

    if curve_type == '指数曲线':
        formula = 'np.log(%s) ~ np.log(%s) + %s' % (y.name, '常数项', x.name)

    if curve_type == '复合曲线':
        formula = 'np.log(%s) ~ np.log(%s) + np.log(%s)*%s' % (y.name, '常数项',
                                                               '常数项', x.name)

    if curve_type == '增长曲线':
        formula = 'np.log(%s) ~ %s + %s' % (y.name, '常数项', x.name)

    if curve_type == 'S型曲线':

        def _r(x):
            return 1 / x

        formula = 'np.log(%s) ~ %s + _r(%s)' % (y.name, '常数项', x.name)

    res = ols(formula, df).fit()
    tables = res.summary().tables
    df_list = [pd.read_html(StringIO(t.as_html()))[0] for t in tables]

    td = {
        'R平方': res.rsquared,
        '调整R方': res.rsquared_adj,
        '标准误': res.mse_model,
        'AIC': res.aic,
        'BIC': res.bic,
        '有效样本': res.nobs
    }
    tb1 = pd.Series(td).to_frame(name='模型汇总').T

    tb2 = df_list[1].set_index(0).iloc[1:].loc[['Intercept', x.name]]
    tb2.columns = ['回归系数', '标准误差SE', 'Z值', 'p值', '95%CI(下限)', '95%CI(上限)']
    tb2 = tb2.rename(index={'Intercept': '常数'})

    tb3 = pd.Series({
        '自由度': res.df_model,
        'F-值': res.fvalue,
        'p-值': res.f_pvalue
    }).to_frame(name='回归总体ANOVA')

    s4 = pd.Series(res.predict(), name=str(y.name) + '-拟合')
    if curve_type in ['指数曲线', '复合曲线', '增长曲线', 'S型曲线']:
        s4 = np.exp(s4)

    tb4 = y.to_frame().join(s4)

    return {'模型汇总': tb1, '回归系数汇总表': tb2, 'ANOVA表格': tb3, '实际值与拟合值': tb4}
Example #16
__package__ = None

_logger = logging.getLogger(__name__)

index = pa.Index([datetime(1991, 12, 31),
                  datetime(1992, 1, 1),
                  datetime(1992, 1, 2),
                  datetime(1992, 1, 3),
                  datetime(1992, 1, 6),
                  datetime(1992, 1, 7),
                  datetime(1992, 1, 8),
                  datetime(1992, 1, 9),
                  datetime(1992, 1, 10),
                  datetime(1992, 1, 13)])

data = pa.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], index=index, dtype=float)
datanode = mdf.datanode("data", data)

df = pa.DataFrame({"A" : [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                   "B" : [np.nan, 9, 8, np.nan, 6, 5, 4, 3, 2, 1]},
                        index=index, dtype=float)
dfnode = mdf.datanode("df", df)

filter = pa.Series([False, True, False, True, False,
                    True, False, True, False, True], index=index)
filternode = mdf.datanode("filter", filter)

queue_expected = [data[data.index <= d].tolist() for d in index]
queue_filtered_expected = [data[filter][data[filter].index <= d].tolist() for d in index]

delay_expected = data.shift(1)
agent_spec = {
    "type": "a2c",
    "learning_rate": 0.0003,
    "discount": 1.0,
    "estimate_terminal": False,
    "max_episode_timesteps": 20000,#理解成在update之前至少多少个timestep
    "network": network_spec,
    "batch_size": 100,
    "update_frequency":100
}

environment = TradingEnvironment(exchange=exchange,
                                 action_strategy=action_strategy,
                                 reward_strategy=reward_strategy,
                                 feature_pipeline=feature_pipeline)

strategy = TensorforceTradingStrategy(environment=environment,
                                      agent_spec=agent_spec,
                                      save_best_agent=False)
#%%Start Over
performance = strategy.run(episodes=100, evaluation=False)
r = pd.Series(strategy._runner.episode_rewards)
#manually store agent
#strategy.save_agent(directory = 'save/', filename = '01')

#%% Restore and Continue 
'''
strategy.restore_agent(directory = 'save/', filename = 'best-model')
performance = strategy.run(episodes=(strategy._runner.agent.episodes + 20), evaluation=False)
'''
with open('poetry.txt','r', encoding='UTF-8') as f:
    raw_text = f.read()
lines = raw_text.split("\n")[:-1]
poem_text = [i.split(':')[1] for i in lines]
char_list = [re.findall(r'[\x80-\xff]{3}|[\w\W]', s) for s in poem_text]


# In[3]:


# Chinese character <-> integer id mapping

all_words = []
for i in char_list:
    all_words.extend(i)
word_dataframe = pd.DataFrame(pd.Series(all_words).value_counts())
word_dataframe['id'] = list(range(1,len(word_dataframe)+1))

word_index_dict = word_dataframe['id'].to_dict()
index_dict = {}
for k in word_index_dict:
    index_dict.update({word_index_dict[k]:k})
    
len(all_words), len(word_dataframe), len(index_dict)


# In[4]:


# Generate training data: x is the previous two characters, y is the next character.
# e.g. 明月几时有 would be turned into three (x, y) samples
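
# The generation code itself is cut off in this excerpt. A hedged sketch of
# the idea described above: slide a window of two characters over each poem
# and use the following character as the label (the variable names here are
# illustrative, not from the source).
seq_len = 2
train_x, train_y = [], []
for poem in char_list:
    ids = [word_index_dict[w] for w in poem]
    for i in range(len(ids) - seq_len):
        train_x.append(ids[i:i + seq_len])
        train_y.append(ids[i + seq_len])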
                             args=[df_PDO.loc[0]['TrainIsTrue']],
                             result_type='broadcast')

        # Butter Lowpass
        dates = df_PDOsplit.index
        freqraw = (dates[1] - dates[0]).days
        ls = ['solid', 'dotted', 'dashdot', 'dashed']
        fig, ax = plt.subplots(1,1, figsize=(10,5))
        list_dfPDO = [df_PDOsplit]
        lowpass_yrs = [.25, .5, 1.0, 2.0]
        for i, yr in enumerate(lowpass_yrs):
            window = int(yr * functions_pp.get_oneyr(dates).size)  # low-pass window length in samples
            if i == 0:
                ax.plot_date(dates, df_PDOsplit.values, label=f'Raw ({freqraw} day means)',
                          alpha=.3, linestyle='solid', marker=None)
            df_PDObw = pd.Series(filters.lowpass(df_PDOsplit, period=window).squeeze(),
                                 index=dates, name=f'PDO{yr}bw')
            ax.plot_date(dates, df_PDObw, label=f'Butterworth {yr}-year low-pass',
                    color='red',linestyle=ls[i], linewidth=1, marker=None)
            df_PDOrm = df_PDOsplit.rolling(window=window, closed='right', min_periods=window).mean()
            df_PDOrm = df_PDOrm.rename({'PDO':f'PDO{yr}rm'}, axis=1)
            ax.plot_date(dates, df_PDOrm,
                         label=f'Rolling mean {yr}-year low-pass (closed right)', color='green',linestyle=ls[i],
                         linewidth=1, marker=None)
            list_dfPDO.append(df_PDObw)
            list_dfPDO.append(df_PDOrm)
            ax.legend()

        filepath = os.path.join(path_out_main, 'Low-pass_filter.pdf')
        plt.savefig(filepath, bbox_inches='tight')
        df_PDOs = pd.concat(list_dfPDO,axis=1)

    functions_pp.store_hdf_df({'df_data':df_PDOs},
Example #20
## Historical data
# ## Lowest individual transaction price
dta= [10000, 11600, 14000, 18700, 23900, 10000,10700, 11500, 11600, 12300
       , 13100, 14500, 16000, 14800,10300, 12000, 13200, 14500, 16800, 20200, 26000, 35000,
       10000, 13500, 16000, 19000, 24600, 21000, 19500, 21100, 23800, 25000, 23400, 10100, 15100, 18100, 19500, 18600
       , 15300, 15100, 15900, 17300, 18900, 21500, 23900, 25000, 26800, 28800, 30000, 30700, 18000
       , 21000, 22800, 25300, 32100]
## Average individual transaction price
# dta = [22822, 14138, 11067, 15743, 20631, 26599, 30802, 16384, 12777
#        , 14074, 15391, 17024, 18310, 16654, 12382,13137, 14331, 15436, 17798, 21506, 27672
#        , 37805, 36231, 16886, 17487, 20609, 27077, 25727, 21884, 23315, 25701, 27127, 28541, 24324, 17330, 19614,
#        21551, 22300, 20591, 17508, 17419, 18358, 20127, 22996, 25498, 26668, 28561, 30535, 32449, 34046, 32312,
#        25213, 24560, 26939, 34455]

dta = np.array(dta, dtype=float)  # np.float was removed in newer NumPy; use the builtin float
dta = pd.Series(dta)
dta.index = pd.Index(sm.tsa.datetools.dates_from_range('2001', '2055'))
dta.plot(figsize=(12,8))
fig = plt.figure(figsize=(12,8))
ax1= fig.add_subplot(111)
## Differencing
diff1 = dta.diff(1)
diff1.plot(ax=ax1)
fig = plt.figure(figsize=(12,8))
ax2= fig.add_subplot(111)
diff2 = dta.diff(2)
diff2.plot(ax=ax2)
##
diff1= dta.diff(1)
fig = plt.figure(figsize=(12,8))
ax1=fig.add_subplot(211)