def main():
    business_data = pd.read_csv('processed_data/business_data.csv', index_col=None)
    checkin_data = pd.read_csv('processed_data/checkin_data.csv', index_col=None)
    tip_data = pd.read_csv('processed_data/tip_data.csv', encoding="latin_1", index_col=None)
    review_data = pd.read_csv('processed_data/review_data.csv', encoding="latin_1", index_col=None)
    training_data = pd.read_csv('data/train_labels.csv', index_col=None)

    # Convert training dates to seconds for easier maths
    training_data.loc[:, 'date'] = training_data.loc[:, 'date'].apply(functions.date_to_seconds).astype('int32')

    # Figure out the names of the TFIDF feature columns
    feature_re = re.compile(r'^[tr]\.')
    tip_features = [col for col in tip_data.columns if feature_re.match(col) is not None]
    review_features = [col for col in review_data.columns if feature_re.match(col) is not None]

    # Add restaurant IDs to everything
    id_dict = functions.build_restaurant_id_map('data/restaurant_ids_to_yelp_ids.csv')
    map_to_boston_ids = lambda yid: id_dict[yid] if yid in id_dict else np.nan
    for df in [business_data, checkin_data, tip_data, review_data]:
        df['restaurant_id'] = df['business_id'].map(map_to_boston_ids)
        df.drop(['business_id'], axis=1, inplace=True)
        # Drop those businesses that are not in the Boston dataset
        df.dropna(axis=0, subset=['restaurant_id'], inplace=True)

    # FIXME Do something other than average the values across matching restaurant IDs?
    # This works because the only non-numeric fields are restaurant_id and business_id.
    # restaurant_id becomes the new index after grouping, so it is also not a normal column.
    business_data = business_data.groupby('restaurant_id', sort=False).mean()
    checkin_data = checkin_data.groupby('restaurant_id', sort=False).mean()

    # Sum the TFIDF features for everything that comes before in time for a given tip or review
    # FIXME maybe don't drop user_id?
    tip_data.reset_index(inplace=True)
    tip_data.drop(['user_id', 'text'], axis=1, inplace=True)
    review_data.reset_index(inplace=True)
    review_data.drop(['text'], axis=1, inplace=True)

    # Finally, join everything
    training_data = create_evaluation_data(business_data, checkin_data, tip_data, review_data,
                                           tip_features, review_features, training_data)
    training_data.to_csv("processed_data/training_data.csv", index=False)

    # And join the submission data
    submission_data_1 = pd.read_csv('data/SubmissionFormat.csv', index_col=None)
    submission_data_1.loc[:, 'date'] = submission_data_1.loc[:, 'date'].apply(functions.date_to_seconds).astype('int32')
    submission_data_1 = create_evaluation_data(business_data, checkin_data, tip_data, review_data,
                                               tip_features, review_features, submission_data_1)
    submission_data_1.to_csv("processed_data/submission_data.csv", index=False)

    submission_data_2 = pd.read_csv('data/PhaseIISubmissionFormat.csv', index_col=None)
    submission_data_2.loc[:, 'date'] = submission_data_2.loc[:, 'date'].apply(functions.date_to_seconds).astype('int32')
    submission_data_2 = create_evaluation_data(business_data, checkin_data, tip_data, review_data,
                                               tip_features, review_features, submission_data_2)
    submission_data_2.to_csv("processed_data/phase2_data.csv", index=False)
    return
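The pipeline above calls functions.date_to_seconds to turn the 'date' column into integer seconds, but that helper is not shown. Below is a minimal illustrative sketch of what it might look like, assuming ISO-formatted (YYYY-MM-DD) date strings and seconds counted from the Unix epoch; the actual helper in the source may differ.

# Hypothetical sketch of functions.date_to_seconds (assumed, not from the source).
import calendar
import datetime


def date_to_seconds(date_string):
    # Parse an ISO-formatted date and return integer seconds since the Unix epoch (UTC).
    dt = datetime.datetime.strptime(date_string, "%Y-%m-%d")
    return calendar.timegm(dt.timetuple())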
def read_dataframe_from_csv(fname):
    """Read a dataframe from a csv file.

    Args:
        fname (str): csv filename
    """
    return pd.read_csv(fname)
def dataset_from_csv(self, filename, time_column='point_in_time'):
    """Read a dataset from a csv file.

    filename : str
        Filename to read.

    Returns a DataFrame with the time column parsed as datetimes.
    """
    return pd.read_csv(filename, parse_dates=[time_column])
def load_data(self, keys=None):
    """
    Load the data from the ``.tsv`` files in the ``self.df_file`` dictionary.

    The optional *keys* parameter is a list of types of counts to be loaded.
    By default, all counts are loaded.
    """
    if keys is None:
        keys = self.df_file.keys()
    else:
        if not all(key in self.df_file.keys() for key in keys):
            raise EnrichError("Cannot load unsaved counts", self.name)
    for key in keys:
        self.counts[key] = pd.read_csv(self.df_file[key], sep="\t")
def get_review_exp(owner, project_name, commit_id, author):
    pr = get_pull_request_for_commit(owner, project_name, commit_id)
    rcount = 0
    pr_exp = []
    last_max_pr = 20000
    pr_exp_df_old = None
    if len(pr) != 0:
        number = int(pr['number'])
        # Check whether cached experience data already exists for this project
        if f'{project_name}.csv' in os.listdir('exp_data'):
            print('file exists')
            pr_exp_df_old = pd.read_csv(f'exp_data/{project_name}.csv')
            last_pr = pr_exp_df_old.pr.max()
            # Check whether the last cached max PR already covers the current one
            if number <= last_pr:
                # Return the cached experience count
                rcount = pr_exp_df_old[pr_exp_df_old.pr <= number].exp.values[0]
                return rcount
            # Otherwise set the end point to the last cached max PR
            else:
                last_max_pr = pr_exp_df_old.pr.values[0]
                rcount = pr_exp_df_old.exp.values[0]
        else:
            print('starting a new project')
        while number > last_max_pr:
            rnames = get_reviewer_names(number)
            if author in rnames:
                rcount += 1
            pr_exp.append({'pr': number, 'exp': rcount})
            number -= 1
        print(f'review count {rcount}')
        pr_exp_df = pd.DataFrame(pr_exp)
        pr_exp_df.exp = pr_exp_df.exp.astype('int')
        pr_exp_df.pr = pr_exp_df.pr.astype('int')
        pr_exp_df.exp = rcount - pr_exp_df.exp
        if pr_exp_df_old is not None:
            pr_exp_df = pd.concat([pr_exp_df_old, pr_exp_df]).sort_values(by='pr', ascending=False)
        pr_exp_df.reset_index(drop=True, inplace=True)
        pr_exp_df.to_csv(f'exp_data/{project_name}.csv')
        return rcount
    else:
        return rcount
def from_lpdata(name):
    filename = '%s_lpdata.txt' % name
    df = pd.read_csv(filename)
    return df
def create_files(n=n, copula_string='independence-copula', path=path,
                 df=pd.read_csv(data_file, index_col=0, parse_dates=True),
                 dimkeys=dimkeys, index=index, kind=kind, marginal_string=marginal_string,
                 method=method, segment_marginal=segment_marginal):
    """
    This function creates the csv files that correspond to the dataframes we use.
    It creates the U sample and the P sample and returns a column of the S sample (cf. documentation).
    :param n: size of the sample
    :param copula_string: the kind of copula you want to compute
    :param datatype: string, whether it is 'actuals' or 'errors'
    :param index_diag: index of the diagonal where you want to project
    :return: vector of the S values (cf. the LaTeX documentation for the meaning of U, O, P, Q, R, S)
    """
    l = len(df.index)
    if method == 'wholeyear':
        dt_list = df.index
    elif method == 'daytoday':
        dt_list = df.index[100:l - 1]
    dimension = len(dimkeys)
    dict_epis_parameters = dict.fromkeys(dimkeys)
    if marginal_string == 'univariate-epispline':
        for i in dimkeys:
            if os.path.isfile(path + '/epispline_parameters_' + str(i) + '_' + marginal_data + '.csv'):
                dict_epis_parameters[i] = pd.read_csv(
                    path + '/epispline_parameters_' + str(i) + '_' + marginal_data + '.csv',
                    index_col=0, parse_dates=True)
            else:
                dict_epis_parameters[i] = pd.DataFrame(None, index=dt_list,
                                                       columns=['alpha', 'beta', 'N', 'w0', 'u0', 'delta'])
    diago = diag(dimension)
    suffix = return_suffix(kind, index, dimkeys, copula_string, segment_marginal)
    P_file_exists = os.path.isfile(path + '/samples/P_' + suffix)
    if P_file_exists:
        df_P = pd.read_csv(path + '/samples/P_' + suffix, index_col=0, parse_dates=True)
    else:
        df_U = dict.fromkeys(dimkeys)
        for i in dimkeys:
            U_file_exists = os.path.isfile(path + '/samples/U_' + copula_string)
            if U_file_exists:
                df_U[i] = pd.read_csv(path + '/samples/U_' + copula_string + '_' + str(i) + '.csv',
                                      index_col=0, parse_dates=True)
            else:
                df_U[i] = pd.DataFrame(None, index=dt_list, columns=range(n))
        df_P = pd.DataFrame(None, index=dt_list, columns=range(n))
    S = []
    if method == 'wholeyear':
        input = dict.fromkeys(dimkeys)
        for i in dimkeys:
            input[i] = df[i].values.tolist()
        marginals = dict.fromkeys(dimkeys)
        for i in dimkeys:
            marginals[i] = marginal_from_input(marginal_string, input[i], None, method, dict_epis_parameters[i])
        distr_class = distribution_factory(copula_string)
        mydistr = distr_class(dimkeys, input)
    for mydt in dt_list:
        # print(mydt)
        if method == 'daytoday':
            input = dict.fromkeys(dimkeys)
            for i in dimkeys:
                input[i] = df[i].loc[df.index < mydt].values.tolist()
            if segment_marginal == 'segmented':
                input_segmented = dict.fromkeys(dimkeys)
                for i in dimkeys:
                    segmented_df = segmenter.OuterSegmenter(
                        df.loc[df.index < mydt], df,
                        'copula_experiments/segment_input_wind_FH' + str(i) + '.txt',
                        mydt).retval_dataframe()
                    input_segmented[i] = segmented_df[i].values.tolist()
                input_marginals = input_segmented
            else:
                input_marginals = input
            marginals = dict.fromkeys(dimkeys)
            for i in dimkeys:
                marginals[i] = marginal_from_input(marginal_string, input_marginals[i], mydt, method,
                                                   dict_epis_parameters[i])
            distr_class = distribution_factory(copula_string)
            mydistr = distr_class(dimkeys, input)
        if P_file_exists:
            P = df_P.loc[mydt].values
        else:
            if U_file_exists:
                # Reassemble the n x dimension U sample from the per-dimension files
                U = np.transpose([df_U[k].loc[mydt].values for k in dimkeys])
            else:
                U = mydistr.generates_U(n)
                for j in range(dimension):
                    df_U[dimkeys[j]].loc[mydt] = U[:, j]
            P = diago.proj_scalar(U, index, kind)
            df_P.loc[mydt] = P
        O = df.loc[mydt].loc[dimkeys].values
        Q = np.zeros(dimension)
        for i in range(dimension):
            Q[i] = marginals[dimkeys[i]].cdf(O[i])
        R = diago.proj_scalar(Q, index, kind, mydistr)
        counter = sum(1 for i in range(n) if P[i] <= R)
        S.append(counter / n)
    if marginal_string == 'univariate-epispline':
        for i in dimkeys:
            dict_epis_parameters[i].to_csv(path + '/epispline_parameters_' + str(i) + '_' + marginal_data + '.csv')
    if not P_file_exists:
        df_P.to_csv(path + '/samples/P_' + suffix)
        for i in dimkeys:
            df_U[i].to_csv(path + '/samples/U_' + copula_string + '_' + str(i) + '.csv')
    return S
def stock_price(symbol):
    path = 'data/{}'.format(symbol.upper())
    if not os.path.exists(path):
        # Placeholder: no cached data is fetched here; the file is assumed to exist.
        pass
    return pd.read_csv(path)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
from nltk.corpus import stopwords
from collections import Counter
import os
import io

# This pulls in the cleaned file (nulls etc. already removed).
final_set = pd.read_csv(
    '/Users/jacobtryba/DSI/assignments/capstone2/data/cleaned_data.csv')

# This defines a function to remove punctuation, which is then applied to the three
# text columns used in the analysis: description, title, genre.
punct_to_remove = string.punctuation


def remove_punctuation(text):
    return text.translate(str.maketrans('', '', punct_to_remove))


final_set['description'] = final_set['description'].apply(
    lambda text: remove_punctuation(text))
final_set['title'] = final_set['title'].apply(
    lambda text: remove_punctuation(text))
final_set['genre'] = final_set['genre'].apply(
    lambda text: remove_punctuation(text))

# This builds the set of stop words, which is then used on the description column.
stop_words = set(stopwords.words('english'))
import pandas as pd

df = pd.read_csv('build.csv')
print(df)
def load_fit_manif2table(unit_list, netname, dataroot, ang_step=9, save=True,
                         load=False, GANname="", savestr=""):
    """Load experiments into a table, algorithmic version.

    In particular, it loads the evolution information into the table.
    load: if True, load the saved stats table instead of computing a new one.
    """
    if load:
        nettab = pd.read_csv(
            join(dataroot, "summary", '%s_ManifExpFitSum%s.csv' % (netname, savestr)),
            index_col=0)
        return nettab
    theta_arr = np.arange(-90, 90.1, ang_step) / 180 * np.pi
    phi_arr = np.arange(-90, 90.1, ang_step) / 180 * np.pi
    stat_col = []
    for unit in unit_list[:]:
        layer = unit[1]
        layerdir = "%s_%s_manifold-%s" % (netname, layer, GANname)
        RFfit = unit[-1]
        suffix = "rf_fit" if RFfit else "original"
        npyfns = glob(join(dataroot, layerdir, "*.npy"))
        if len(unit) == 6:
            pattern = re.compile(r"Manifold_score_%s_(\d*)_%d_%d_%s.npy" % (layer, unit[3], unit[4], suffix))
        else:
            pattern = re.compile(r"Manifold_score_%s_(\d*)_%s.npy" % (layer, suffix))
        matchpatt = [pattern.findall(fn) for fn in npyfns]
        iChlist = [int(mat[0]) for mat in matchpatt if len(mat) == 1]
        fnlist = [fn for mat, fn in zip(matchpatt, npyfns) if len(mat) == 1]
        print("Found %d units in %s - %s layer!" % (len(iChlist), netname, layer))
        for iCh in iChlist:
            if len(unit) == 6:
                unit_lab = "%s_%d_%d_%d" % (layer, iCh, unit[3], unit[4])
            elif len(unit) == 4:
                unit_lab = "%s_%d" % (layer, iCh)
            else:
                raise NotImplementedError
            explabel = "%s_%s" % (unit_lab, suffix)
            data = np.load(join(dataroot, layerdir, "Manifold_score_%s.npy" % explabel))
            Mdata = np.load(join(dataroot, layerdir, "Manifold_set_%s.npz" % explabel))
            # Final-generation activation from the Evolution experiment
            gens = Mdata["evol_gen"]
            finalscores = Mdata["evol_score"][gens == gens.max()]
            initscores = Mdata["evol_score"][gens == (gens.min() + 1)]
            tval, pval = ttest_1samp(finalscores, initscores.mean())
            for spi in range(data.shape[0]):  # all spaces
                unitstat = EasyDict()
                if len(unit) == 6:
                    unitstat.pos = (unit[3], unit[4])
                elif len(unit) == 4:
                    unitstat.pos = None
                actmap = data[spi, :, :]  # PC2-3 space
                param, param_std, _, R2 = fit_Kent_Stats(theta_arr=theta_arr, phi_arr=phi_arr, act_map=actmap)
                unitstat.netname = netname
                unitstat.layer = layer
                unitstat.iCh = iCh
                unitstat.explabel = explabel
                unitstat.space = spi
                unitstat.RFfit = RFfit
                unitstat.imgsize = Mdata["imgsize"]
                unitstat.corner = Mdata["corner"]
                # Maximal activation from Manifold
                unitstat.actmax = actmap.max()
                unitstat.actmin = actmap.min()
                unitstat.evolfinact = finalscores.mean()
                unitstat.evolttest = tval
                unitstat.evolttest_p = pval
                # Fitting stats
                unitstat.R2 = R2
                for i, pnm in enumerate(param_names):
                    unitstat[pnm] = param[i]
                    unitstat[pnm + "_std"] = param_std[i]
                # Append to collection
                stat_col.append(unitstat)
    nettab = pd.DataFrame(stat_col)
    if save:
        os.makedirs(join(dataroot, "summary"), exist_ok=True)
        nettab.to_csv(join(dataroot, "summary", '%s_ManifExpFitSum%s.csv' % (netname, savestr)))
    return nettab
if args['dataset'] == 'CIFAR':
    model = CIFAR_Model()
    _, valid_loader, _, _ = create_loaders(is_train=False, is_valid=True, is_test=False)
else:
    model = MNIST_Model()
    _, valid_loader, _, _ = create_loaders(which_dataset='MNIST', is_train=False, is_valid=True, is_test=False)

mask = create_mask(model)
model.load_state_dict(torch.load('environment\\model.pth'))
mask.load_state_dict(torch.load('environment\\mask.pth'))
criterion, _ = create_criterion_optimizer(model)

# Evaluate the masked model on the validation set
valid_loss = .0
model.eval()
with torch.no_grad():
    for images, labels in valid_loader:
        apply_mask(model, mask)
        outputs = model(images)
        valid_loss += criterion(outputs, labels).item()
valid_loss /= len(valid_loader)

# Record the validation loss in the last cell of the pruning log
df = pd.read_csv('wrapping\\pruned_layers.csv', sep=',')
df.iloc[-1, -1] = valid_loss
df.to_csv('wrapping\\pruned_layers.csv', sep=',', index=False)
import pandas as pd

df = pd.read_csv('EUR_USD_2000/EUR_USD_SECOND_QUARTER_2000.csv')

rows_list = []
for row in input_rows:
    dict1 = {}
    # get the input row in dictionary format, keyed by column name
    dict1.update(row)
    rows_list.append(dict1)

df = pd.DataFrame(rows_list)
# In[8]:

df

# In[7]:

df['tokens_clean'] = df['tokens_nostop'].apply(lambda x: [" ".join(x)])
#df['tokens_clean'] = df['tokens_nostop'].apply(lambda x: [str(x)])
#" ".join([str(item) for var in data for item in var])

# In[ ]:

df.to_csv('cleaned_review_data.csv')
#df = pandas.read_csv('../data/cleaned_review_data.csv')
df = pandas.read_csv('../data/cleaned_review_data.csv', encoding='utf-8')

# In[13]:

import csv
df = csv.reader(open('../data/cleaned_review_data.csv'))

# In[ ]:

# r.to_csv("data.csv")
# grouped.to_csv("nodupes_alllyrics_tokenized.csv")
df2 = pandas.read_csv("../data/cleaned_review_data.csv")
df2

# In[77]:
def ngram(inp, window):
    # Sliding-window "ngram" frame: each output row is `window` consecutive
    # input rows concatenated side by side.
    rows, cols = inp.shape
    resRows = rows - window + 1
    res = pd.DataFrame(index=range(resRows), columns=range(window * cols))
    for idx in range(resRows):
        if window - 1 > 0:
            # Concatenate the current row with the following window-1 rows
            newRow = inp.iloc[idx:idx + window].values.flatten()
        else:
            newRow = inp.iloc[idx].values
        if idx % 10000 == 0:
            print(idx)
        res.iloc[idx] = newRow
    return res


# Read training data into memory
iotRaw = pd.read_csv('resources/normal_20170202_2229.csv')
# Select training columns
iot = iotRaw[["LinAccX..g.", "LinAccY..g.", "LinAccZ..g."]]
# Set training window and ngram
window = 200
iot = ngram(iot, window)
# Send the data to H2O
iot_hex = h2o.H2OFrame(iot)
# Run the deeplearning model in autoencoder mode
neurons = 50
iot_dl = h2o.estimators.deeplearning.H2OAutoEncoderEstimator(model_id="iot_dl", autoencoder=True,
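A quick illustrative check of the ngram helper above (not part of the original script); the toy frame and expected output are assumptions for demonstration only.

# Illustrative check of the windowing: 5 input rows, 2 columns, window=3
# should yield 3 output rows of 6 values each.
demo = pd.DataFrame({'a': [1, 2, 3, 4, 5], 'b': [10, 20, 30, 40, 50]})
windowed = ngram(demo, 3)
print(windowed.shape)             # (3, 6)
print(windowed.iloc[0].tolist())  # [1, 10, 2, 20, 3, 30]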
def load_facebook():
    return pd.read_csv("facebook/normalized-facebook-insights-post-level-data.csv")
def file_selection_from_csv(filename):
    return ESGFFileSelection(pd.read_csv(filename))
def readTweets():
    # Note: list() over a DataFrame yields its column labels, not its rows.
    return list(pd.read_csv('data/tweets.csv'))