Code example #1
def run_demo():  
    #df=pd.read_sql('SELECT * FROM GAMEAPP;', con= db)
    (var,tp,timestart,timeend)=open()
    gym= time_list(var,tp,timestart,timeend)
    a= input('type:')
    a= a.lower()
    anly_dict=dict()
    for k  in  gym:
        anly_dict[k]=gym[k][a]
    data = pd.DataFrame.from_dict(anly_dict, orient='columns', dtype=None)
    data_other, data = cross_validation.train_test_split(data, test_size=0.001, random_state=10)  # keep only 0.1% of the rows to shorten the run for testing
    train_and_valid, test = cross_validation.train_test_split(data, test_size=0.2, random_state=10)  
    train, valid = cross_validation.train_test_split(train_and_valid, test_size=0.01, random_state=10)  
    train_feature, train_target = get_features_target(train)  
    test_feature, test_target = get_features_target(test)  
    valid_feature, valid_target = get_features_target(valid)  
  
    params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,  
              'learning_rate': 0.01, 'loss': 'ls'}  
    clf = ensemble.GradientBoostingRegressor(**params)  
  
    clf.fit(train_feature, train_target)  # train the model
    # mse = mean_squared_error(test_target, clf.predict(test_feature))  # predict and compute the MSE
    # print(mse)  
    pre=clf.predict(test_feature)  
    pre_list=list(pre)  
    real_pre_zip=zip(test_target,pre_list)  
  
    count=len(pre_list)  
    error=rmspe(real_pre_zip,count)  
    print(error)  
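The rmspe helper called above is not part of this snippet. Assuming it computes the root mean squared percentage error over the (actual, predicted) pairs, as the name suggests, a minimal sketch could look like this:

import math

def rmspe(real_pre_zip, count):
    # root mean squared percentage error over (actual, predicted) pairs;
    # pairs whose actual value is 0 are skipped to avoid division by zero
    total = 0.0
    for real, pre in real_pre_zip:
        if real != 0:
            total += ((real - pre) / real) ** 2
    return math.sqrt(total / count)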
Code example #2
File: Data_eda.py Project: sherrellm/SP_predict
def create_SP_500_member_df():
	'''Creates the data frame used for modeling, with the column SP_500_member set to 1 when the stock is a member of the S&P 500.'''
	quarter_order, quarter_membership_lists = generate_sp_membership_list()
	df = load_quarterly()
	SP_500_member = np.zeros(df.shape[0])
	df = df.reset_index(drop=True)

	for i, row in df.iterrows():
		if row['quarter'].ordinal in quarter_order:
			if row['ticker'] in quarter_membership_lists[quarter_order.index(row['quarter'].ordinal)]:
				SP_500_member[i] = 1 
	df['SP_500_member'] = SP_500_member

	items_added=0
	rows_added_dict = {'quarter':{},'ticker':{},'SP_500_member':{}}
	for quarter in df.quarter.unique():
		for ticker in df.ticker.unique():
			if ticker not in df[df.quarter == quarter].ticker.values:
				rows_added_dict['quarter'].update({items_added: quarter.ordinal})
				rows_added_dict['ticker'].update({items_added: ticker})
				rows_added_dict['SP_500_member'].update({items_added: 0})
				items_added += 1

	df = pd.concat([df, pd.DataFrame.from_dict(rows_added_dict)])

	return df
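Note that rows_added_dict is a mapping of column name to {row position: value}, which is the dict-of-dicts layout pd.DataFrame.from_dict expects with the default orient='columns'. A minimal, self-contained illustration of that pattern (the values below are made up):

import pandas as pd

rows_added_dict = {'quarter': {0: 201, 1: 202},
                   'ticker': {0: 'AAPL', 1: 'MSFT'},
                   'SP_500_member': {0: 0, 1: 0}}
extra_rows = pd.DataFrame.from_dict(rows_added_dict)  # outer keys become columns
print(extra_rows)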
Code example #3
File: layer.py Project: ajduberstein/slayer
    def __init__(
        self,
        data,
        time_field=None,
        min_time=None,
        max_time=None,
        pickable=True,
        opacity=1,
        title='',
        js_function_overrides={}
    ):
        super(Layer, self).__init__()
        if not isinstance(data, pd.DataFrame):
            data = pd.DataFrame.from_records(data)  # from_dict has no 'records' orient
        self.data = data
        class_name = self.__class__.__name__
        # Layer name for deck.gl
        self.layer_type = class_name if 'Layer' in self.__class__.__name__ else class_name + 'Layer'
        self.js_function_overrides = js_function_overrides
        self.title = title
        self.pickable = 'true' if pickable else 'false'
        self.id = '"%s"' % uuid.uuid4()

        if time_field is not None:
            try:
                times = data[time_field]
            except KeyError:
                raise Exception("Data does not have a time field named `%s`" % time_field)
            self.update_triggers = "{getColor: [timeFilter], getElevationValue: [timeFilter]}"
        self.time_field = time_field
        self.min_time = min(times) if time_field else None
        self.max_time = max(times) if time_field else None

        self.opacity = float(opacity)
Code example #4
 def export_df(self):
     dickey_fullers = {}
     date_index = []
     for k, v in self.regressions.items():
         dickey_fullers[k] = v['dickey_fuller']
         date_index.append(v['date_range'][0])
     out_df = pd.DataFrame.from_dict(dickey_fullers, orient='index')  # one row per regression
     out_df.index = date_index
     return out_df
Code example #5
def infer(request):
    form = ModelForm(None)
    context = {"form": form}
    if request.method == "POST":
        form = ModelForm(request.POST)
        if form.is_valid():
            # wrap each scalar in a list so from_dict builds a one-row frame
            model_data = pd.DataFrame.from_dict(
                {k: [v] for k, v in form.cleaned_data.items()})
            with open(r'..\models\model.sav', 'rb') as f:
                mdl = pickle.load(f)
            pred = mdl.predict(model_data)
            print(pred)
        context = {"form": form}
    return render(request, 'py2tableau/infer_model_data.html', context)
Code example #6
def calculate_centroid(df_raw, pdg_acc, cwd):
    # Calculate the centroid for each cluster
    subprocess.run('cat ' + pdg_acc +
                   '.reference_target.SNP_distances.tsv | cut -f1,5,9,12 > ' +
                   pdg_acc + '_selected_distance.tsv',
                   shell=True,
                   check=True)
    distance_file = pdg_acc + '_selected_distance.tsv'
    df_distance = pd.read_csv(distance_file, header=0, sep='\t')
    cluster_list = df_distance.groupby('PDS_acc').size().index.tolist()
    cluster_center_dict = {'PDS_acc': [], 'target_acc': []}
    # TODO: a custom folder name for the skesa assemblies maybe
    if os.path.exists(os.path.join(cwd, 'fasta')):
        pass
    else:
        os.mkdir(os.path.join(cwd, 'fasta'))
    for cluster in cluster_list:
        # get all isolates in this cluster
        df_cluster = df_distance.loc[df_distance['PDS_acc'] == cluster]
        # append the dataset with switched columns to get a "full" pairwise distance matrix
        df_append = pd.concat([
            df_cluster,
            df_cluster.rename(columns={
                "target_acc_1": "target_acc_2",
                "target_acc_2": "target_acc_1"
            })
        ])
        # add up all distances for one target and try downloading the skesa assembly for the one with minimum distances
        for target in df_append.groupby(
                'target_acc_1')['delta_positions_unambiguous'].sum(
                ).sort_values().index.tolist():
            SRR = str(
                df_raw.loc[df_raw['target_acc'] == target]['Run'].iloc[0])
            if SRR == 'nan':
                continue
            try:
                # try downloading the skesa assembly, if failed turn to next genome in this cluster
                subprocess.check_call(
                    "dump-ref-fasta http://sra-download.ncbi.nlm.nih.gov/srapub_files/"
                    + SRR + "_" + SRR + ".realign > fasta/" + SRR +
                    "_skeasa.fasta",
                    shell=True,
                    stderr=subprocess.DEVNULL)
                cluster_center_dict['PDS_acc'].append(cluster)
                cluster_center_dict['target_acc'].append(target)
                break
            except subprocess.CalledProcessError:
                continue
    # delete all the empty fasta files
    os.system('find . -size 0 -delete')
    df_cluster_center = pd.DataFrame.from_dict(cluster_center_dict)
    df_cluster_center.to_csv(pdg_acc + '_PDS_center.csv')

    return df_cluster_center
Code example #7
 def segment_frame(
     self,
     data=None,
     seg_size=24 * 7,
     seg_strafe=24 * 1,
     fields='all',
     window='hamming',
 ):
     '''
     Uses the function segment_dict with the same parameters.
     It simply converts the data frame into a dictionary, runs segment_dict,
     and converts the result back into a data frame.
     '''
     data = pd.DataFrame.from_dict(
         self.segment_dict(
             data.to_dict(),
             seg_size,
             seg_strafe,
             fields,
             window,
         ))
     return data
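The round trip described in the docstring relies on DataFrame.to_dict() producing a {column: {index: value}} mapping that pd.DataFrame.from_dict can rebuild with the default orient='columns'. A minimal illustration of just that round trip, without the segmentation step:

import pandas as pd

df = pd.DataFrame({'load': [1.0, 2.0, 3.0]})
as_dict = df.to_dict()                     # {'load': {0: 1.0, 1: 2.0, 2: 3.0}}
rebuilt = pd.DataFrame.from_dict(as_dict)  # equivalent frame, same layout
assert rebuilt.equals(df)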
Code example #8
	def recommend(self):
		self.find_nearest()
		for customer in self.customers:
			# assumed per-customer helper that fills self.recommendations
			# (calling recommend() itself here would raise a TypeError)
			self._recommend_for(customer)
		return pd.DataFrame.from_dict(self.recommendations)
Code example #9
def main():
    tqdm.pandas()

    jump_range = 21
    data_path = "data/"

    if not data_path[-1] == "/" and not data_path[-1] == "\\":
        data_path = data_path + "/"

    if not os.path.exists(data_path):
        os.makedirs(data_path)

    filenames = [
        "systems_populated.json", "listings.csv", "commodities.json",
        "stations.json"
    ]

    for name in filenames:
        if need_to_pull(data_path, name):
            pull_from_eddb(data_path, name)

    commodities = read_data(data_path, "commodities.json")
    prices = read_data(data_path, "listings.csv")
    systems = read_data(data_path, "systems_populated.json")
    stations = read_data(data_path, "stations.json")

    #print("Finding the Tritium Commodity ID")
    #for commodity in commodities:
    #  if commodity["name"] == "Tritium":
    #    tritium_id = commodity["id"]
    #    break

    filtered_trades_stations = {}

    result = None
    print(
        "Trimming stations to only those that we can land at and have markets")
    for station in stations:
        if station["has_market"] and station[
                "max_landing_pad_size"] == "L" and not station[
                    "type"] == "Fleet Carrier":
            station_id = station["id"]
            system_id = station["system_id"]

            if station_id == 3:
                print(station)
            filtered_trades_stations[station_id] = {
                "station_id": station_id,
                "system_id": system_id,
                "station_name": station["name"]
            }

    systems_df = pandas.DataFrame.from_dict(systems)
    systems_df = systems_df[["id", "name", "x", "y", "z"]]

    filtered_stations_df = pandas.DataFrame.from_dict(filtered_trades_stations,
                                                      orient="index")

    print("Joining systems and stations")
    filtered_stations_systems = filtered_stations_df.merge(systems_df,
                                                           how='left',
                                                           left_on='system_id',
                                                           right_on='id')
    filtered_stations_systems.drop("id", inplace=True, axis=1)

    print("Joining stations and prices")
    prices = prices[[
        "station_id", "commodity_id", "supply", "buy_price", "sell_price",
        "demand"
    ]]
    prices = prices.merge(filtered_stations_systems,
                          how='right',
                          left_on='station_id',
                          right_on='station_id',
                          suffixes=("_prices", "_systems")).apply(lambda x: x)
    prices.dropna(how='any', inplace=True)

    prices = prices.astype({
        "station_id": "uint32",
        "commodity_id": "uint16",
        "supply": "uint32",
        "buy_price": "uint64",
        "sell_price": "uint64",
        "demand": "uint32",
        "system_id": "uint64",
        "x": "float32",
        "y": "float32",
        "z": "float32"
    })

    for commodity in tqdm(commodities, desc='Commodities'):
        commodity_id = commodity['id']

        print("Filtering prices")
        stations_prices = prices[
            (prices['station_id'].isin(filtered_trades_stations.keys()))
            & (prices['commodity_id'] == commodity_id)]

        station_prices_buy = stations_prices[
            (stations_prices['buy_price'] > 0)
            & (stations_prices['supply'] > 5000)]
        station_prices_sell = stations_prices[
            (stations_prices['sell_price'] > 0)
            & (stations_prices['demand'] > 0)]

        if not station_prices_buy.empty and not station_prices_sell.empty:
            print("Joining buy and sell stations")
            print(station_prices_buy.info())
            print(station_prices_sell.info())

            #station_prices_diff = station_prices_buy.merge(station_prices_sell,how='left',on='join_key',suffixes=('_buy','_sell'))
            station_prices_diff = cartesian_product_multi(
                *[station_prices_buy, station_prices_sell])

            #station_prices_diff.columns = stations_prices.columns
            print(station_prices_diff.info())
            print(station_prices_diff)
            print("Getting profit for each route")
            station_prices_diff['profit'] = station_prices_diff[
                'sell_price_sell'] - station_prices_diff['buy_price_buy']

            print("Getting distance of each route")
            station_prices_diff[
                'route_distance'] = station_prices_diff.progress_apply(
                    lambda row: get_distance(row["x_buy"], row["y_buy"], row[
                        "z_buy"], row["x_sell"], row["y_sell"], row["z_sell"]),
                    axis=1)

            print("Getting jumps per route")
            station_prices_diff['route_jumps'] = np.ceil(
                station_prices_diff['route_distance'] / jump_range)

            print("Getting profit/jump")

            # avoid division by zero for zero-jump (same-system) routes
            station_prices_diff.loc[station_prices_diff['route_jumps'] == 0,
                                    'route_jumps'] = 1
            station_prices_diff['profit_per_jump'] = station_prices_diff[
                'profit'] / station_prices_diff['route_jumps']
            station_prices_diff.sort_values(by='profit_per_jump',
                                            ascending=False,
                                            inplace=True)

            station_prices_diff['commodity_id'] = commodity_id

            if result is None:
                result = station_prices_diff
            else:
                result = pandas.concat([result, station_prices_diff])

    result.sort_values(by='profit_per_jump', ascending=False, inplace=True)

    commodities_df = pandas.DataFrame.from_dict(commodities)
    result = result.merge(commodities_df,
                          how='left',
                          left_on='commodity_id',
                          right_on='id',
                          suffixes=('_result', '_commodity'))

    pandas.set_option('display.max_columns', None)
    print(result[[
        'commodity_name', 'station_name_buy', 'name_buy', 'station_name_sell',
        'name_sell', 'profit_per_jump', 'buy_price_buy', 'sell_price_sell',
        'supply_buy', 'demand_sell'
    ]].head(10))
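get_distance is not shown in this snippet. Given that x, y and z come from the systems table, it presumably returns the straight-line distance between the buy and sell systems; a minimal sketch under that assumption:

import math

def get_distance(x1, y1, z1, x2, y2, z2):
    # Euclidean distance between two systems in 3-D galactic coordinates
    return math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2 + (z2 - z1) ** 2)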
Code example #10
cols = [
    c for c in train.columns if c not in [
        'is_churn', 'msno', 'gender', 'bd', 'transaction_date',
        'membership_expire_date'
    ]
]

from sklearn.model_selection import GridSearchCV
param_test1 = {'max_depth': range(10, 100, 10)}
gsearch1 = GridSearchCV(estimator=xgb.XGBClassifier(
    learning_rate=0.02,
    n_estimators=140,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=20,
    scale_pos_weight=1,
    seed=27),
                        param_grid=param_test1,
                        scoring='neg_log_loss',
                        n_jobs=20,
                        cv=5)
gsearch1.fit(train[cols], train['is_churn'])
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_

sortie = pd.DataFrame.from_dict(gsearch1.best_params_, orient='index')  # scalar values need orient='index'
sortie.to_csv('bestParam.csv')
Code example #11
File: model2.py Project: ovshake/recurrency-network
	C.append(10e6 * i)
	C.append(10e7 * i)
	gamma.append(10e-1 * i)
	gamma.append(10e-2 * i)
	gamma.append(10e-3 * i)
	gamma.append(10e-4 * i)
	gamma.append(10e1 * i)
	gamma.append(10e2 * i)
	gamma.append(10e3 * i)
	gamma.append(10e4 * i)
	gamma.append(10e5 * i)
	gamma.append(10e6 * i)
	
parameters = {'C' : C, 'gamma' : gamma}


svc = SVC(kernel='rbf')

clf = GridSearchCV(svc, parameters, cv=5 , verbose = 1, scoring = make_scorer(accuracy_score))

clf.fit(X_train, y_train)

results = clf.cv_results_ 
df = pd.DataFrame.from_dict(results)
df.to_csv("GridSearchResults.csv") 
print (clf.get_params())

y_pred = clf.predict(X_test)

print (np.sum(y_pred == y_test) / y_pred.shape[0])
Code example #12
def graph_reduction(f_from, f_to):
    # read all nodes and edges
    f = open(f_from, 'r')
    spl = f.readlines()

    edge_weight = []
    u = []
    v = []

    # only take corr < 0.7
    for i in range(len(spl)):
        (src, dst, correl) = (spl[i].rstrip()).split('\t')
        if float(correl) < limit:
            band = int(float(correl) / step + 1)
            edge_weight.append((src, dst, str(band)))
        if src not in u: u.append(src)
        if dst not in v: v.append(dst)

    wg = nx.Graph()
    wg.add_weighted_edges_from(edge_weight)

    edges = wg.edges(data='weight')

    nodes_list = pd.DataFrame({'id': [], 'group': []})

    for source, target, weight in edges:
        if source not in nodes_list['id'].values:
            nodes_list = pd.concat(
                [nodes_list, pd.DataFrame([{'id': source, 'group': weight}])],
                ignore_index=True)
        if target not in nodes_list['id'].values:
            nodes_list = pd.concat(
                [nodes_list, pd.DataFrame([{'id': target, 'group': weight}])],
                ignore_index=True)
    """ create cluster.json """
    nodes_str = ""
    count = 0
    for i, r in nodes_list.iterrows():
        nodes_str = nodes_str + "{\"id\":\"" + r['id'] + "\", \"score\":" + r[
            'group'] + "},"
        count += 1
    nodes_str = nodes_str.rstrip(',')

    links_str = ""
    for i, j, k in edges:
        source_index = np.where(nodes_list['id'] == i)
        target_index = np.where(nodes_list['id'] == j)
        links_str = links_str + "{\"source\":" + str(
            source_index[0][0]) + ", \"target\":" + str(
                target_index[0][0]) + "},"
    links_str = links_str.rstrip(',')

    json_str = "{\"nodes\":[" + nodes_str + "], \"links\":[" + links_str + "]}"
    parsed = json.loads(json_str)
    json.dump(parsed, open(f_to, 'w'), indent=4)

    dg = nx.degree_centrality(wg)
    # dict of node -> scalar centrality, so orient='index' is required
    degree_centrality = pd.DataFrame.from_dict(dg, orient='index', columns=['degree'])

    #http://bl.ocks.org/AMDS/4a61497182b8fcb05906
    f_ticker_profile = '../data/tickers_prof.tsv'
    tickers_prof = pd.read_csv(f_ticker_profile, sep='\t', index_col=0)

    # map each ticker's degree centrality onto the profile table
    tickers_prof['degree'] = tickers_prof['ticker'].map(dg)
Code example #13
 def statistical_test(self):
     df = pd.DataFrame.from_dict(self.vec_negative)
     print(df.head())
Code example #14

def next(mean, intercept, std):
    new_mean = mean + np.random.normal(0, 0.1, 1)[0]
    new_intercept = intercept + np.random.normal(0, 0.1, 1)[0]
    new_std = max(0, std + np.random.normal(0, 0.1, 1)[0])
    # print('new:')
    # print(new_mean, new_intercept, new_std)
    # Metropolis acceptance test in log space
    # (probability() is assumed to return a log-probability)
    log_a = probability(new_mean, new_intercept, new_std) - probability(
        mean, intercept, std)
    if log_a >= 0:
        mean, intercept, std = new_mean, new_intercept, new_std
    else:
        u = np.random.uniform(0, 1, 1)
        if math.log(u) <= log_a:
            mean, intercept, std = new_mean, new_intercept, new_std
    print(mean, intercept, std)
    return round(mean, 1), round(intercept, 1), round(std, 1)


mean, intercept, std = 0, 0, 1

dist_mean = defaultdict(int)
dist_std = defaultdict(int)
for i in range(200):
    mean, intercept, std = next(mean, intercept, std)
    dist_mean[round(mean, 1)] += 1
    print(dist_mean)

out_df = pandas.DataFrame.from_dict(dist_mean, orient='index')  # scalar counts need orient='index'
Code example #15
# .....

# Extract the code from your webapp response
# code = request.get('code') # or whatever your framework does
# access_token = client.exchange_code_for_token(client_id=22120,
# client_secret='<client_secret>', code=code)

client.access_token = access_token

if (os.path.isfile('raw_strava_data--------------.csv')):
    strava_data = pd.read_csv('raw_strava_data.csv')
    strava_data['start_date_local'] = pd.to_datetime(
        strava_data['start_date_local'])
    activities = client.get_activities(strava_data['start_date_local'].max())
    for activity in activities:
        # build a one-row frame from the activity dict; the result must be reassigned
        strava_data = pd.concat([strava_data, pd.DataFrame([activity.to_dict()])],
                                ignore_index=True)
    strava_data.to_csv('raw_strava_data.csv')
else:
    activities = client.get_activities()
    types = ['time', 'latlng', 'altitude', 'heartrate', 'temp']
    headers_written = False
    #stream_types = ['time', 'latlng', 'altitude', 'heartrate', 'temp']
    stream_types = ['heartrate']
    with open('raw_strava_data.csv', 'w') as f:
        for activity in activities:
            streams = client.get_activity_streams(activity.id,
                                                  types=stream_types,
                                                  resolution='medium')
            temp = activity.to_dict()
            for k in types:
                if k in streams:
Code example #16
            ]

            for idx, val in enumerate(curr_vals):
                if (val in component_defs.keys()
                        and len(component_defs[val]) == 1):
                    curr_vals[idx] = component_defs[val][0]

            component_defs[curr_key] = curr_vals

for key, val in component_defs.items():
    # print(key, val)
    val_str = ''.join(component_defs[key])

    component_defs[key] = val_str

    print('{} --> {}'.format(key, val_str))

# Read the dictionary of macro defs into a Pandas DataFrame & write it to csv
col_names = ['Macro', 'Definition']
# a flat str -> str dict maps most directly onto the two columns via items()
df = pd.DataFrame(list(component_defs.items()), columns=col_names)

out_path = r"C:\Users\nich980\Documents\Docs\Hector\coding-notes"
f_name_out = "component_names.csv"

out_abs = join(out_path, f_name_out)

df.to_csv(out_abs, sep=',', header=True, index=False)
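If you prefer to keep DataFrame.from_dict for this step, an equivalent construction that yields the same Macro/Definition columns is:

df = (pd.DataFrame.from_dict(component_defs, orient='index', columns=['Definition'])
      .rename_axis('Macro')
      .reset_index())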