def spec_scrape(folder, save=False):
    """
    Extract load, position and slit size information from a .spc file.

    The single .spc file in ``folder`` is read, its first line is dropped,
    and every remaining line that contains the scan name (the file name
    without ``.spc``) starts a record. The values of a record are picked
    from fixed line offsets relative to that line.

    Parameters
    ----------
    folder : str
        Directory holding exactly one .spc file.
    save : bool
        If True, the resulting frame is pickled to ``<folder>/<scan>.pkl``.

    Returns
    -------
    pandas.DataFrame
        One row per scan with the columns "Scan Number", "Load (kN)",
        "x (mm)", "y (mm)", "slit_x (mm)" and "slit_y (mm)".

    Raises
    ------
    AssertionError
        If the folder holds zero or several .spc files.
    """
    candidates = sorted(name for name in os.listdir(folder)
                        if name.endswith(".spc"))
    error = "Either zero or multiple .spc files have been found."
    assert len(candidates) == 1, error
    spec_file = candidates[0]
    scan = spec_file[:-4]

    with open(os.path.join(folder, spec_file), "r") as f:
        # The first line of the file is never part of a record.
        lines = [line.rstrip("\n") for line in f][1:]

    search = r"-?\ *[0-9]+\.?[0-9]*(?:[Ee]\ *-?\ *[0-9]+)?"

    def numbers(text):
        """Return every number found in ``text`` as a float."""
        return [float(token) for token in re.findall(search, text)]

    data_store = []
    for idx, line in enumerate(lines):
        if scan not in line:
            continue
        # Only the two position tokens are converted on this line.
        x, y = [float(token)
                for token in re.findall(search, lines[idx + 11])[2:4]]
        slit_x = numbers(lines[idx + 12])[7]
        slit_y = numbers(lines[idx + 13])[1]
        scan_num = numbers(line)[-1]
        load = numbers(lines[idx + 23])[-3]
        data_store.append([int(scan_num), load, x, y, slit_x, slit_y])

    column_names = ("Scan Number", "Load (kN)", "x (mm)", "y (mm)",
                    "slit_x (mm)", "slit_y (mm)")
    df = pd.DataFrame(data_store, columns=column_names)
    if save:
        pd.to_pickle(df, os.path.join(folder, "%s.pkl" % scan))
    return df
def main(data_path, rng): allfiles = listdir(data_path) # Just in case. allfiles = sorted(allfiles) if rng: low, high = rng.split("-") low = int(low) high = int(high) howmany = high - low else: l = len(allfiles) low, high = 0, l howmany = high - low n = 0 print "Preprocessing features, files in the range %s"%(rng) print "Progress:" # Read each driver trips for i in range(low, high): driver = allfiles[i] #output = open("data/stats/%s"%(driver), "w") trips = read_driver_trips(data_path, driver) #headers = trips[0].keys() #output.write(','.join(headers) + '\n') df = pd.DataFrame(trips) outputname = driver.split('.')[0] pd.to_pickle(df, "data/stats/%s.pkl"%(outputname)) if n%10 == 0: print "%d more to go"%(howmany-n) n += 1
def clean_dictionaries(dictionary=impact_factors, filename="impactfactors.pickle"):
    """Overwrite the pickle at ``filename`` with an empty dictionary.

    Parameters
    ----------
    dictionary : dict
        Accepted for interface compatibility only: whatever is passed is
        discarded and an empty dict is written instead.
    filename : str
        Path of the pickle file to overwrite.
    """
    # Write to the requested path. The name ``file`` is only the Python 2
    # builtin type (and undefined on Python 3), never the target path.
    pd.to_pickle({}, filename)
def get_subjects_list_adults_fct(df_path, df_qc_path, subjects_list):
    '''
    Return the subjects that pass the age and motion criteria.

    The two pickled frames are merged on their index. A subject is dropped
    when it is younger than 18, has mean_FD_Power above .1, has the
    placeholder age 999, or has no row in the merged frame.

    df_path: pickle of the frame providing ``age`` and ``subject_id``
    df_qc_path: pickle of the QC frame providing ``mean_FD_Power``
    subjects_list: candidate subject ids; it is not modified
    returns: new list with the retained ids, in their original order

    The merged frame is also dumped to 'testdf.pkl' in the working directory.
    '''
    import pandas as pd

    df = pd.read_pickle(df_path)
    df_qc = pd.read_pickle(df_qc_path)
    df = pd.merge(df, df_qc, left_index=True, right_index=True)
    pd.to_pickle(df, 'testdf.pkl')
    df['subject_id'] = df.subject_id_x

    # fixme exclude subjects with mean_FD>.1
    excluded = set(df[(df.age < 18) | (df.mean_FD_Power > .1)].index)

    # Age 999 marks a missing age.
    # NOTE(review): the former sex test, np.logical_or('M','F') == False, was a
    # constant that never looked at the frame, so it filtered nothing. Add a
    # real check here once the name of the sex column is confirmed.
    excluded.update(df[df.age == 999].index)

    known = set(df.index)

    # Build a new list instead of removing from the list being iterated:
    # in-place removal skips the element after each removed one and also
    # alters the caller's list.
    return [subject for subject in subjects_list
            if subject in known and subject not in excluded]
def save_task_table(self, file_path=None, task_table=None):
    """Pickle a task table to disk.

    Parameters
    ----------
    file_path : str, optional
        Destination path. When falsy, ``<self.data_dir>/task_table.pkl``
        is used.
    task_table : object, optional
        Table to store. When None, ``self.get_task_table()`` supplies it.
    """
    if not file_path:
        file_path = os.path.join(self.data_dir, 'task_table.pkl')
    # Compare against None explicitly: the truth value of a DataFrame is
    # ambiguous and raises ValueError.
    if task_table is None:
        task_table = self.get_task_table()
    pandas.to_pickle(task_table, file_path)
def main():
    """Cross-validate an XGBoost multi-class model and store the error curves.

    For every (train, validation) split produced by ``cv_bymonth`` a booster
    is trained for ``nrounds`` rounds; the per-round 'error' values of the
    train and eval sets are appended as new columns to two frames. Both
    frames are pickled under 'cv_results/sessions_e20_25n/', then
    ``full_cv`` is run on the complete training data.
    """
    # load input data for xgboost
    xgbInput = fetch_data.clfInput()
    xgbInput.sessions_ftrEng()
    xgbInput.users_ftrEng()
    xgbInput.one_hot()
    xgbInput.split_data()

    param = {
        'num_class': 12,
        'objective': 'multi:softprob',
        'seed': 0,
        'eta': 0.20,
        'max_depth': 6,
        'subsample': .5,
        # XGBoost names this parameter 'colsample_bytree'; under the key
        # 'col_sample_bytree' the column subsampling was never applied.
        'colsample_bytree': .6,
    }

    results = {}
    cv_train = pd.DataFrame()
    cv_valid = pd.DataFrame()
    nrounds = 40

    for train_indx, valid_indx in cv_bymonth(xgbInput):
        dtrain = xgb.DMatrix(xgbInput.train_X[train_indx],
                             label=xgbInput.train_Y[train_indx],
                             missing=-1)
        dvalid = xgb.DMatrix(xgbInput.train_X[valid_indx],
                             label=xgbInput.train_Y[valid_indx],
                             missing=-1)
        evallist = [(dtrain, 'train'), (dvalid, 'eval')]
        # The returned booster is not needed; only the recorded errors are.
        xgb.train(param, dtrain, nrounds, evallist,
                  feval=calc_ndcg.evalerror, evals_result=results)
        cv_train = pd.concat([cv_train, pd.Series(results['train']['error'])], axis=1)
        cv_valid = pd.concat([cv_valid, pd.Series(results['eval']['error'])], axis=1)

    pd.to_pickle(cv_train, 'cv_results/sessions_e20_25n/tr_err_av.p')
    pd.to_pickle(cv_valid, 'cv_results/sessions_e20_25n/val_err_av.p')
    full_cv(xgbInput.train_X, xgbInput.train_Y, 'fulltr_err_av.p', param, nrounds)
def pickle_trialDataSource():
    '''
    Convert every source in trialSourceDict (except the "index" entry) to a
    dataframe tagged with its key as 'eventType', stack them, and pickle the
    result to eventPickleLoc.

    When gazeEventsDF already holds data, the rows of the current trialNum
    are replaced by the new ones and the updated gazeEventsDF is pickled
    instead.

    return: void
    '''
    global gazeEventsDF

    trial_events = pd.DataFrame()
    for key, source in trialSourceDict.items():
        if key == "index":
            continue
        events = source.to_df()
        events['eventType'] = key
        # Newer frames go in front of what has been collected so far.
        trial_events = pd.concat([events, trial_events], axis=0)

    if gazeEventsDF is False:
        # Nothing stored yet: only the current trial is written.
        pd.to_pickle(trial_events, eventPickleLoc)
        return

    # Remove old records from current trial, then add the new data.
    gazeEventsDF = gazeEventsDF[gazeEventsDF['trialNum'] != trialNum]
    gazeEventsDF = pd.concat([gazeEventsDF, trial_events], axis=0)
    pd.to_pickle(gazeEventsDF, eventPickleLoc)
def test_min_ms_cvar_avgsp(n_stock, win_length, alpha, scenario_cnt=1): """ :param n_stock: range(5, 55) :param win_length: range(50, 250) :param alphas: list :return: """ t_start_date, t_end_date = date(2005, 1, 3), date(2014, 12, 31) symbols = EXP_SYMBOLS[:n_stock] # read rois panel roi_path = os.path.join( os.path.abspath(os.path.curdir), '..', 'data', 'pkl', 'TAIEX_2005_largest50cap_panel.pkl') # shape: (n_period, n_stock, {'simple_roi', 'close_price'}) roi_panel = pd.read_pickle(roi_path) # shape: (n_period, n_stock) risk_rois = roi_panel.loc[t_start_date:t_end_date, symbols, 'simple_roi'].T n_period = len(risk_rois.index) n_scenario = 200 risk_free_roi = np.zeros(n_period, dtype=np.float) allocated_risk_wealth = np.zeros(n_stock, dtype=np.float) allocated_risk_free_wealth = 1e6 buy_trans_fee = 0.001425 sell_trans_fee = 0.004425 # read scenario scenario_name = "{}_{}_m{}_w{}_s{}_{}_{}.pkl".format( START_DATE.strftime("%Y%m%d"), END_DATE.strftime("%Y%m%d"), len(symbols), win_length, n_scenario, "unbiased", scenario_cnt) scenario_path = os.path.join(EXP_SP_PORTFOLIO_DIR, 'scenarios', scenario_name) scenario_panel = pd.read_pickle(scenario_path) predict_risk_rois = scenario_panel.loc[t_start_date:t_end_date] predict_risk_free_rois = np.zeros((n_period, n_scenario)) # model t0 = time() res = min_ms_cvar_avgsp_portfolio(symbols, risk_rois.index, risk_rois.as_matrix(), risk_free_roi, allocated_risk_wealth, allocated_risk_free_wealth, buy_trans_fee, sell_trans_fee, alpha, predict_risk_rois.as_matrix(), predict_risk_free_rois, n_scenario, solver="cplex", verbose=False) print res pd.to_pickle(res, os.path.join(TMP_DIR, 'min_ms_cvar_avgsp.pkl')) print predict_risk_rois.mean(axis=2) print "all_scenarios_min_cvar_avgsp_portfolio: " print "(n_period, n_stock, n_scenarios):({}, {}, {}): {:.4f} secs".format( n_period, n_stock, 200, time() - t0 )
def conv2pkl(name):
    """Read network.dat and community.dat and pickle both under ``name``.

    The edge list goes to binary_networks/data/<name>_edge.pkl and the
    community labels to binary_networks/data/<name>_label.pkl.
    """
    out_dir = "binary_networks/data"
    elist = read_network("network.dat")
    coms = read_community("community.dat")
    targets = (
        (elist, os.path.join(out_dir, name+"_edge.pkl")),
        (coms, os.path.join(out_dir, name+"_label.pkl")),
    )
    for payload, destination in targets:
        pd.to_pickle(payload, destination)
def convert(split_size=max_timestep):
    """Convert the HAR text files into one pickle, har_data.pkl.

    Features and labels of the train and test sets are read from
    har_dataset/, passed through HarData.split_time with ``split_size``,
    and stored as ``[(x_train, y_train), (x_test, y_test)]``. Labels are
    shifted by -1 and one-hot encoded to HarData.output_size classes first.
    """
    def read_matrix(path):
        # Every input file is header-less.
        return pd.read_csv(path, header=None).as_matrix()

    def load_features(path):
        return HarData.split_time(read_matrix(path), split_size)

    def load_labels(path):
        one_hot = tflearn.data_utils.to_categorical(read_matrix(path) - 1,
                                                    HarData.output_size)
        return HarData.split_time(one_hot, split_size)

    x_train = load_features('har_dataset/train/X_train.txt')
    y_train = load_labels('har_dataset/train/y_train.txt')
    x_test = load_features('har_dataset/test/X_test.txt')
    y_test = load_labels('har_dataset/test/y_test.txt')

    pd.to_pickle([(x_train, y_train), (x_test, y_test)], 'har_data.pkl')
def ComputeGlobaldf(s,nu):
    """Compute getGlobaldf for five statistics and pickle them together.

    The statistics 'AF', 'HAF', 'tajimaD', 'H' and 'SFSelect' are each
    computed with removeFixedSites=True and the stored SVM, then saved as
    a dict to ``path + 'nu<nu>.s<s>.df'``.
    """
    # The pickled SVM is looked up through the top-level module name
    # 'metaSVM', so that name is registered before loading it.
    from popgen.SFselect import metaSVM
    sys.modules['metaSVM']=metaSVM
    svm=pd.read_pickle('/home/arya/sfselect/SVMs/general_SVM_sp.pck')

    statistics = ('AF', 'HAF', 'tajimaD', 'H', 'SFSelect')
    dfs = {}
    for statistic in statistics:
        dfs[statistic] = getGlobaldf(s, statistic, svm, removeFixedSites=True)

    pd.to_pickle(dfs, path+'nu{}.s{}.df'.format(nu,s))
def retry_write_pickle(data, file_path, retry_cnt=10):
    """Pickle ``data`` to ``file_path``, retrying on I/O errors.

    Parameters
    ----------
    data : object
        Object to pickle.
    file_path : str
        Destination path.
    retry_cnt : int
        Maximum number of write attempts.

    Raises
    ------
    Exception
        Wrapping the last IOError/EOFError when every attempt failed.
    """
    for retry in range(retry_cnt):
        try:
            pd.to_pickle(data, file_path)
        except (IOError, EOFError) as e:
            if retry == retry_cnt - 1:
                raise Exception(e)
            print ("dispatch:writing retry: {}, {}".format(retry+1, e))
            # Random back-off so concurrent writers do not retry in lockstep.
            time.sleep(np.random.rand() * 10)
        else:
            # Stop after the first successful write instead of rewriting
            # the file on every remaining iteration.
            return
def create_graph(g_name,lis,dd,d,pickle=False):
    """Build an undirected graph and optionally pickle it.

    Parameters
    ----------
    g_name : str
        Base name of the pickle file ("<g_name>.pkl"); only used when
        ``pickle`` is True.
    lis : iterable
        Nodes to add.
    dd : iterable
        Weighted edges, passed to ``add_weighted_edges_from``.
    d : iterable
        Edges, passed to ``add_edges_from``.
    pickle : bool
        Whether to write the graph to "<g_name>.pkl".

    Returns
    -------
    The constructed ``nx.Graph``.
    """
    # Keep the graph in its own variable: rebinding ``g_name`` to the graph
    # lost the file name and made ``g_name + ".pkl"`` a Graph + str error.
    graph = nx.Graph()
    graph.add_nodes_from(lis)
    graph.add_weighted_edges_from(dd)
    graph.add_edges_from(d)
    if pickle:
        pd.to_pickle(graph, g_name+".pkl")
    return graph
def test_round_trip_current(self):
    """Pickle each expected object, read it back and compare.

    Every ``expected`` value is written with ``pd.to_pickle`` to a path
    provided by ``tm.ensure_clean``, reloaded with ``pd.read_pickle`` and
    handed to ``self.compare_element`` together with its ``typ`` key.
    """
    # NOTE(review): the iterable of the outer loop is missing between
    # ``in`` and the inner ``for`` -- this line does not parse as written.
    # It presumably iterated the items of a dict of dicts; restore it from
    # version control.
    for typ, dv in
        for dt, expected in dv.items():
            with tm.ensure_clean(self.path) as path:
                pd.to_pickle(expected,path)
                result = pd.read_pickle(path)
                self.compare_element(typ, result, expected)
def merge():
    """Aggregate the per-run AP/MRR results into mean and std.

    The 200 result files ``outpath + 'numFB<numFeedbacks>i<i>.pkl'`` are
    read on a best-effort basis: files that are missing, unreadable or
    malformed are skipped, as are results whose AP (element 0) is falsy.
    The summary dict is pickled to
    '/home/arya/PubMed/prefFB<numFeedbacks>.pkl'.
    """
    APs, MRRs = [], []
    for i in range(200):
        try:
            res = pd.read_pickle(outpath+'numFB{}i{}.pkl'.format(numFeedbacks,i))
            # Read both values before appending so the two lists cannot
            # end up with different lengths.
            ap, mrr = res[0], res[1]
        except Exception:
            # Best effort, but unlike a bare ``except`` this lets
            # KeyboardInterrupt and SystemExit propagate.
            continue
        if ap:
            APs.append(ap)
            MRRs.append(mrr)
    pd.to_pickle(
        {'AP':{'mean':np.mean(APs),'std':np.std(APs)},
         'MRR':{'mean':np.mean(MRRs),'std':np.std(MRRs)}},
        '/home/arya/PubMed/prefFB{}.pkl'.format(numFeedbacks))
def split_data(infile, train, test, attrfile, na_strategy, trainpct, split_randomly):
    """Split a pickled dataset into train and test pickles.

    Parameters
    ----------
    infile : str
        Pickle holding the input frame.
    train, test : str
        Output paths for the train and test frames.
    attrfile, na_strategy
        Forwarded to ``strip_and_process_na``.
    trainpct : number
        Percentage (0-100) of rows that go to the training set.
    split_randomly : bool
        True: pick the training rows at random. False: take the first
        rows after sorting by the "Date" column.
    """
    expanded_data = strip_and_process_na(pd.read_pickle(infile), attrfile, na_strategy)
    train_example_count = int(len(expanded_data.index) * trainpct / 100.0)
    if split_randomly:
        # replace=False: the default samples with replacement, which
        # duplicates training rows and leaves too many rows in the test set.
        train_indices = np.random.choice(expanded_data.index,
                                         size=train_example_count,
                                         replace=False)
    else:
        # NOTE(review): DataFrame.sort is the API this code was written
        # against; newer pandas spells it sort_values.
        train_indices = expanded_data.sort("Date").index[:train_example_count]
    # The indices are index labels, so select by label.
    train_data = expanded_data.loc[train_indices]
    test_data = expanded_data.drop(train_indices)
    pd.to_pickle(train_data, train)
    pd.to_pickle(test_data, test)
def save_meta(meta, meta_fn='meta.pandas.pickl'):
    """Back up the existing metadata file, then write the new metadata.

    Parameters
    ----------
    meta : pd.DataFrame
        The project metadata to store.
    meta_fn : str
        Name of the pickle file; it is passed to ``backup`` before being
        overwritten.
    """
    destination = meta_fn
    backup(destination)
    pd.to_pickle(meta, destination)
def generate_reference(data, file_base):
    """ Pickle the column dtypes of every experiment found in a results frame.

    For each unique value of data['experiment_exp_id'] the experiment is
    extracted (with clean=False) and the dtypes of its columns are recorded.

    :data: the data dataframe of a expfactory Result object
    :file_base: output path without extension; '.pkl' is appended
    """
    experiment_ids = numpy.unique(data['experiment_exp_id'])
    exp_dic = {
        exp_id: extract_experiment(data, exp_id, clean=False).dtypes
        for exp_id in experiment_ids
    }
    pandas.to_pickle(exp_dic, file_base + '.pkl')
def run(self):
    """Parse the input log line by line and pickle it as a DataFrame.

    Every line of ``self.input().path`` is converted with
    ``apache2_logrow``; the resulting table gets nine fixed column names
    and is written to ``self.output().path``.
    """
    with open(self.input().path) as handle:
        log_lines = handle.readlines()

    parsed_rows = [apache2_logrow(log_line) for log_line in log_lines]

    df = pd.DataFrame(parsed_rows)
    df.columns = [
        'Host',
        'Log_Name1',
        'Log_Name',
        'Date_Time',
        'Method',
        'Response_Code',
        'Bytes_Sent',
        'URL',
        'User_Agent',
    ]
    pd.to_pickle(df, self.output().path)
def dump_mlb_data(outfile, start_date=None, end_date=None, max_count=None, use_random=False, datatype='batting'): """ Dump MLB statistical data to a file. :param str outfile: name of file to become pickled pandas datafile :param str start_date: don't include games from before this date when dumping data :param str end_date: don't include games from after this date when dumping data :param int max_count: maximum # of rows to dump :param bool use_random: whether to select rows at random (if False, choose most recent) :return: """ print 'Dump MLB data for', datatype print 'loading data...' all_bsbr_logs = load_gamelogs(datatype=datatype) unindexed_dfs = [] print 'reindexing data...' pbar = progressbar.ProgressBar(widgets=[progressbar.Percentage(), ' ', progressbar.Bar(), ' ', progressbar.ETA()]) for player_id, dataframe in pbar(all_bsbr_logs.items()): uidf = dataframe.reset_index() # Add player ID as a column to the dataframe for future joining purposes! uidf['player_id'] = pandas.Series(data=player_id, index=uidf.index) unindexed_dfs.append(uidf) all_game_rows = pandas.concat(unindexed_dfs, ignore_index=True) # Filter by date if start_date is not None: all_game_rows = all_game_rows[all_game_rows['Date'] > start_date] if end_date is not None: all_game_rows = all_game_rows[all_game_rows['Date'] < end_date] # Don't use relief pitchers in our dataset if datatype == 'pitching': print 'restricting to starting pitchers only...' all_game_rows = all_game_rows[all_game_rows['player_id'].apply(brefid_is_starting_pitcher)] # Sample filtered data if max_count and max_count < len(all_game_rows): print 'sampling %d rows...' % max_count if use_random: kept_indices = random.sample(all_game_rows.index, max_count) selected = all_game_rows.iloc[kept_indices] else: all_game_rows.sort("Date") selected = all_game_rows.tail(max_count) else: selected = all_game_rows print 'saving...' pandas.to_pickle(selected, outfile) print 'Done!' return selected
def aggregate(self):
    """Aggregate the input files into one pickle and delete the inputs.

    The partial frames derived from the files in ``self.input_data_dir``
    provide ``observer_id`` and ``surface``; each entry of their
    ``file_series`` is processed into the ``oscillogram`` column. The
    result is written to ``<self.aggregated_data_dir>/aggregated_data.pkl``
    and every path listed in the file frame is then removed.
    """
    files = self._get_file_data_frame(self.input_data_dir)
    partial = self._get_partial_data_frames(files)

    columns = {
        'observer_id': partial.observer_id,
        'surface': partial.surface,
        'oscillogram': partial.file_series.apply(self._process_partial_file_series),
    }
    destination = os.path.join(self.aggregated_data_dir, 'aggregated_data.pkl')
    pandas.to_pickle(pandas.DataFrame(columns), destination)

    # The inputs are only deleted once the aggregate has been written.
    for path in files.path:
        os.remove(path)
def read_all_clusterings_cache(drop_centers=True,
                               recompute=False,
                               categoricals=('dset', 'neuropil', 'distance', 'clusterer')):
    """Return the concatenated clustering results, cached as one pickle.

    Parameters
    ----------
    drop_centers : bool
        Forwarded to ``read_clusterings_cache``; also part of the cache
        file name.
    recompute : bool
        Rebuild the cache even if the file already exists.
    categoricals : tuple of str
        Columns converted to the 'category' dtype before caching.

    Returns
    -------
    The DataFrame read back from the cache file.

    Raises
    ------
    Whatever ``pd.read_pickle`` raises, if the cache cannot be read even
    right after it has been rebuilt.
    """
    cache_file = op.join(CLUSTER_RUNS_CACHE_PATH,
                         'all#with_centers=%r.pkl' % (not drop_centers))
    rebuilt = recompute or not op.isfile(cache_file)
    if rebuilt:
        dfs = [read_clusterings_cache(dset, neuropil, drop_centers=drop_centers, categoricals=())
               for dset, neuropil in product(get_all_datasets(), get_all_neuropils())]
        dfs = [df for df in dfs if df is not None]
        df = pd.concat(dfs)
        for categorical in categoricals:
            df[categorical] = df[categorical].astype('category')
        pd.to_pickle(df, cache_file)
    try:
        return pd.read_pickle(cache_file)
    except Exception:
        # quick and dirty account for old pandas versions: a stale cache
        # is rebuilt once. If the file was just rebuilt, retrying cannot
        # help and would recurse without end, so the error is re-raised.
        if rebuilt:
            raise
        return read_all_clusterings_cache(drop_centers=drop_centers,
                                          recompute=True,
                                          categoricals=categoricals)
def data_loader_1(symbol_list):
    """Load returns, build inputs/outputs and split them 80/20.

    Returns are read from 'rets.pkl' when that file exists; otherwise they
    are fetched with ``get_data`` and cached there. The inputs are
    flattened to rows of ``len(symbol_list) * 100`` values and both input
    sets are divided by the standard deviation of all inputs.

    Returns (train_ins, test_ins, train_outs, test_outs).
    """
    cache = 'rets.pkl'
    if not os.path.exists(cache):
        rets = get_data(symbol_list)
        pd.to_pickle(rets, cache)
    else:
        rets = pd.read_pickle(cache)

    ins, outs = sort_data(rets)
    ins = ins.transpose([0,2,1]).reshape([-1, len(symbol_list) * 100])

    div = int(.8 * ins.shape[0])
    train_outs, test_outs = outs[:div], outs[div:]

    # normalize inputs with the spread of the complete input set
    scale = np.std(ins)
    train_ins = ins[:div] / scale
    test_ins = ins[div:] / scale
    return train_ins, test_ins, train_outs, test_outs
def dump_nba_data(outfile, start_date=None, end_date=None, max_count=None, use_random=False):
    """
    Dump NBA statistical data to a file.

    :param str outfile: name of file to become pickled pandas datafile
    :param str start_date: don't include games from before this date when dumping data
        (parsed with parser.parse; defaults to 2010-10-01)
    :param str end_date: don't include games from after this date when dumping data
        (parsed with parser.parse)
    :param int max_count: maximum # of rows to dump
    :param bool use_random: whether to select rows at random (if False, choose most recent)
    :return: the DataFrame that was written to outfile
    """
    if start_date:
        start_date = parser.parse(start_date)
    else:
        start_date = datetime.datetime(2010, 10, 1)
    if end_date:
        end_date = parser.parse(end_date)
    else:
        # NOTE(review): the default value for end_date is missing here, so
        # this statement does not parse as written. Restore it from version
        # control (presumably the current date -- confirm).
        end_date =
    print 'Dump NBA data for %s to %s' % (start_date, end_date)
    print 'loading data...'
    all_game_rows = load_all_game_data()
    # Filter by date (both bounds are exclusive)
    if start_date is not None:
        all_game_rows = all_game_rows[all_game_rows['date'] > start_date]
    if end_date is not None:
        all_game_rows = all_game_rows[all_game_rows['date'] < end_date]
    # Sample filtered data
    if max_count and max_count < len(all_game_rows):
        print 'sampling %d rows...' % max_count
        if use_random:
            # We seed to 0 when we call this from CLI to make sure that random splits are replicable.
            random.seed(0)
            kept_indices = random.sample(all_game_rows.index, max_count)
            selected = all_game_rows.loc[kept_indices]
        else:
            # NOTE(review): the result of sort() is not assigned; unless
            # this pandas version sorts in place, tail() below takes rows
            # in load order rather than the most recent ones -- confirm.
            all_game_rows.sort("date")
            selected = all_game_rows.tail(max_count)
    else:
        selected = all_game_rows
    print 'saving...'
    pandas.to_pickle(selected, outfile)
    print 'Done!'
    return selected
def load_data(path):
    """Load the tracked-objects table found under ``path``.

    A 'trackedobjects.pickle' file is tried first. If anything in that
    attempt fails, the 'trackedobjects.hdf5' file is located instead and
    its sibling '<name>.pickle' is tried; if that also fails, the HDF5
    file is converted and the result cached as that pickle.

    Returns the loaded table after removing rows above speed threshold 2.
    """
    # NOTE(review): the local variable ``pd`` holds the data, not the
    # pandas module; the module is referenced as ``pandas`` throughout.
    try:
        data_filename = get_filename(path, 'trackedobjects.pickle')
        print 'Found file: ', data_filename
        pd = pandas.read_pickle(data_filename)
    except:
        # Any failure above (bare except) falls back to the HDF5 source.
        data_filename = get_filename(path, 'trackedobjects.hdf5')
        print 'Found file: ', data_filename
        # Pickle cache next to the HDF5 file: everything before the first
        # '.' in the file name, plus '.pickle'.
        data_filename_pickled = data_filename.split('.')[0] + '.pickle'
        try:
            pd = pandas.read_pickle(data_filename_pickled)
        except:
            pd = mta.read_hdf5_file_to_pandas.load_data_as_pandas_dataframe_from_hdf5_file(data_filename)
            pandas.to_pickle(pd, data_filename_pickled)
    # NOTE(review): the nesting of this statement was lost when the source
    # was collapsed; it is shown at function level (applied on every path)
    # -- confirm against version control.
    pd = mta.read_hdf5_file_to_pandas.remove_rows_above_speed_threshold(pd, speed_threshold=2)
    return pd
def createData(s):
    """Pickle the 'naive' and 'stable' transition series for selection ``s``.

    Both series are indexed by [1, 10, 100]. The result goes to
    'real/stablity.neutral.pkl' under utl.outpath when s == 0 and to
    'real/stablity.selection.pkl' otherwise.
    """
    # Transition matrix: exponentiate the log values in extended precision,
    # then normalise every row to sum to one.
    log_transition = mkv.Markov.computeTransition(s=s, N=1000, takeLog=True)
    T = log_transition.astype(np.float128).apply(np.exp)
    T = T.apply(lambda x: x/x.sum(),axis=1)

    # NOTE(review): T10 and T100 are not defined in this function; they
    # must exist at module level -- confirm.
    stable = pd.Series([T,T10,T100],index=[1,10,100])
    stable = stable.apply(lambda x: x.applymap(np.log))

    # NOTE(review): the entry stored under index 100 is computed with n=10,
    # the same as the entry under index 10 -- confirm this is intended.
    naive_items = [
        mkv.Markov.computeTransition(s=s, N=1000, takeLog=True),
        mkv.computePowerSimulations(s=s,n=10,save=False),
        mkv.computePowerSimulations(s=s,n=10,save=False),
    ]
    naive = pd.Series(naive_items,index=[1,10,100])

    data = {'naive':naive,'stable':stable}
    target = 'real/stablity.neutral.pkl' if s==0 else 'real/stablity.selection.pkl'
    pd.to_pickle(data,utl.outpath+target)
def run_best_ms_simulation(n_stock ,verbose=False):
    """
    The best multi-stage strategy.

    Loads the ROI panel, builds a BestMSPortfolio for the first ``n_stock``
    experiment symbols over START_DATE..END_DATE and pickles ``reports`` to
    EXP_SP_PORTFOLIO_DIR/best_ms/best_ms_<param>.pkl.

    Raises ValueError when the ROI panel file does not exist.
    ``verbose`` is not used in this function.
    """
    t0 = time()
    # read rois panel
    roi_path = os.path.join(SYMBOLS_PKL_DIR, 'TAIEX_2005_largest50cap_panel.pkl')
    if not os.path.exists(roi_path):
        raise ValueError("{} roi panel does not exist.".format(roi_path))
    # ``param`` is built from the requested n_stock, before it is
    # replaced by the number of symbols actually available.
    param = "{}_{}_m{}".format(
        START_DATE.strftime("%Y%m%d"), END_DATE.strftime("%Y%m%d"), n_stock)
    symbols = EXP_SYMBOLS[:n_stock]
    n_stock = len(symbols)
    # shape: (n_period, n_stock, {'simple_roi', 'close_price'})
    roi_panel = pd.read_pickle(roi_path)
    # shape: (n_period, n_stock)
    exp_risk_rois = roi_panel.loc[START_DATE:END_DATE, symbols, 'simple_roi'].T
    n_exp_period = exp_risk_rois.shape[0]
    exp_risk_free_rois = pd.Series(np.zeros(n_exp_period), index=exp_risk_rois.index)
    allocated_risk_wealth = pd.Series(np.zeros(n_stock), index=symbols)
    initial_wealth = 1e6
    instance = BestMSPortfolio(symbols, exp_risk_rois, exp_risk_free_rois, allocated_risk_wealth, initial_wealth, start_date=START_DATE, end_date=END_DATE)
    # NOTE(review): as written this is a chained assignment, so ``reports``
    # is the file name string and ``instance`` is never used. The
    # expression that produced the reports appears to have been lost
    # (presumably a call on ``instance``) -- restore it from version control.
    reports = file_name = 'best_ms_{}.pkl'.format(param)
    file_dir = os.path.join(EXP_SP_PORTFOLIO_DIR, 'best_ms')
    if not os.path.exists(file_dir):
        os.makedirs(file_dir)
    pd.to_pickle(reports, os.path.join(file_dir, file_name))
    print ("best_ms {} OK, {:.3f} secs".format(param, time()-t0))
def export_files(out):
    """Write ``out`` to recap_export.pkl and a flattened recap_export.csv.

    ``out`` is a nested mapping court -> judge name -> title -> {year: count}.
    The CSV has one row per (court, judge, title) with the total count and
    one column per year; the year columns are sorted as strings.
    """
    to_pickle(out, 'recap_export.pkl')

    rows = []
    for court, judges in out.items():
        for judge_name, titles in judges.items():
            for title, years in titles.items():
                record = OrderedDict()
                record['court'] = court
                record['name'] = judge_name
                record['title'] = title
                record['total count'] = sum(years.values())
                record.update((str(year), count) for year, count in years.items())
                rows.append(record)

    table = pandas.DataFrame(rows)
    leading = ['court', 'name', 'title', 'total count']
    year_columns = sorted(column for column in table.columns if column.isdigit())
    table = table[leading + year_columns]
    table.to_csv('recap_export.csv', index=False)
def forwardSimulation(self,selectionOnRandomSite=False,siteUnderSelection=None,H0=None):
    """
    Simulate the replicates forward in time and store the result in self.X,
    a np 3D array T x nSS x R where T=|{t_1,t_2,..}| (number of times),
    nSS is number of SS, and R is the number of replicates.

    selectionOnRandomSite: pick the site under selection at random (forced
        to True when self.initialCarrierFreq is -1)
    siteUnderSelection: explicit site to put under selection
    H0: initial haplotypes; when None, self.H0 or a new MSMS.Song
        simulation is used

    The only return statement visible here is the bare ``return`` taken
    when no site matches the requested initial carrier frequency.
    """
    if self.initialCarrierFreq==-1: selectionOnRandomSite=True
    if H0 is None:
        if self.H0 is None:
            # NOTE(review): an argument is missing between the two commas
            # after ``r=self.r`` -- this call does not parse as written.
            H0=MSMS.Song(F=self.F, L=self.L, Ne=self.Ne, r=self.r,,uid=self.uidMSMS,msmsFile=self.msmsFile,dir=self.outpathmsms)
        else:
            # NOTE(review): ``self.H010`` looks garbled (the check above
            # reads ``self.H0``) -- confirm the attribute name.
            H0=self.H010
    if self.foldInitialAFs:
        # Fold: flip the columns whose mean is above 0.5.
        idx=H0.mean(0)>0.5
        H0.iloc[:,idx.values]=1-H0.iloc[:,idx.values]
    self.setH0(H0)
    # Keep the original column labels, then relabel the columns with
    # distinct random positions drawn from range(self.L), sorted.
    self.positions_msms=self.H0.columns.values.copy(True)
    self.positions=sorted(np.random.choice(self.L,self.H0.shape[1],replace=False))
    self.H0=pd.DataFrame(self.H0.values, columns=self.positions)
    self.X0=self.H0.mean(0).values
    if selectionOnRandomSite:
        self.set_siteUnderSelection(np.random.randint(0,self.H0.shape[1]))
    elif siteUnderSelection is not None:
        self.set_siteUnderSelection(siteUnderSelection)
    else:
        if not self.s:
            self.set_siteUnderSelection(self.X0.argmax())
        else:
            # Sites exactly at the requested frequency; if there are none,
            # widen the search to +/- 0.025 around it.
            sites=np.sort(np.where(self.X0== self.initialCarrierFreq)[0]);
            if not len(sites):
                sites=np.sort(np.where(( self.X0 <= self.initialCarrierFreq +0.025) & ( self.X0 >= self.initialCarrierFreq -0.025) ) [0]);
                if not len(sites):
                    print 'Try again. No site at freq ',self.initialCarrierFreq, self.uid; return
            self.set_siteUnderSelection(sites[np.random.randint(0,len(sites))])
    pop= self.createInitialDiploidPopulation()
    # One simulation per replicate on a clone of the initial population;
    # the axes are swapped so that replicates end up on the last axis.
    self.X=np.array([self.multiLocSelectionHardSweepOneReplicate(pop.clone()) for _ in range(self.numReplicates)]).swapaxes(0,2).swapaxes(0,1) #makes sure the site under selection does not go to zero
    if self.ignoreInitialNeutralGenerations: self.X=self.X[self.initialNeutralGenerations:,:,:]
    # Prepend the initial frequencies X0, repeated for every replicate.
    self.X=np.append(np.tile(self.X0[:,None],(1,self.X.shape[2]))[None,:,:],self.X,axis=0)
    if self.onlyKeep is not None: self.X=self.X[:,self.X0==self.onlyKeep,:]
    self.sampleDepths()
    # NOTE(review): the condition guarding this save was lost (there is no
    # expression between ``if`` and the call, and no colon) -- this line
    # does not parse as written; restore it from version control.
    if pd.to_pickle(self,self.outpath+self.uid+'.pkl')
def runAllForEachS(s): outpath=home+'out/vineet/'; if not os.path.exists(outpath): os.makedirs(outpath) fname='{}results{}'.format(outpath,float(s)) experimentIDX=range(numExperiments) np.random.shuffle(experimentIDX) print 'Running Experiments for s= {}'.format(s) sys.stdout=open('{}results.out'.format(outpath),'w') sys.stderr=open('{}results.err'.format(outpath),'w') for j in experimentIDX: print 's={} j={}'.format(s,j),sys.stdout.flush() if os.path.exists('{}_{}.pd'.format(fname,j)): continue param=getERParam(simulateData= True,numThreads=1 ,s=s, experimentNumber=j) df,param=runFindSForAllMethods(param) df.to_pickle('{}_{}.pd'.format(fname,j)) pd.to_pickle(param,'{}_{}.pd'.format(fname.replace('results','param'),j)) print s,j,param['initHaps'].shape,sys.stdout.flush() if ps.virtual_memory().percent>95: print >> sys.stderr,'s={} exited!'.format(s) exit()
# NOTE(review): the statements up to the first ``def`` are the tail of a
# definition that starts before this chunk; their original nesting is not
# visible here.
# Checkpoint: every 30 images the labels collected so far are pickled.
if(image_count % 30 == 0):
    data = {'image': cate_list, 'image2': cate_list2, 'url': url_list[0:image_count]}
    frame = DataFrame(data)
    pd.to_pickle(frame, 'C:/Users/user/Desktop/url-image.df')


def http_label_request(image_name):
    """Open ``server_url + image_name`` and return the urllib2 response."""
    return urllib2.urlopen(server_url+image_name)


def init_list(arg_df):
    """Append the 'image' and 'image2' values of every row of ``arg_df``
    to the module-level lists cate_list and cate_list2."""
    # NOTE(review): image_count is declared global but is not assigned in
    # this function.
    global image_count
    for each in arg_df.iterrows():
        # iterrows yields (index, row); each[1] is the row.
        cate_list.append(each[1]['image'])
        cate_list2.append(each[1]['image2'])


# Script: resume from the previously stored labels, run the search, then
# store the complete result.
train_df = pd.read_pickle("C:/Users/user/Desktop/url.df")
reserved_df = pd.read_pickle("C:/Users/user/Desktop/url-image.df")
url_list = train_df.url
init_list(reserved_df)
search()
data = {'image': cate_list, 'image2': cate_list2, 'url': url_list}
frame = DataFrame(data)
pd.to_pickle(frame, 'C:/Users/user/Desktop/url-image.df')
"SVM_OVR: CV Accur", "SVM_OVR: Train Accur", "SVM_OVR: Test Accur", ]) for model_type in ['SVM_OVR', 'LR_OVR']: print("Running CV for %s" % model_type) for cat in range(0, 10): # Read the CV accuracies CV_accuracies = pd.read_pickle( ct.ROOT + "Pickles\\Fitted_Hierarchy_CV\\%s_%s_CV.p" % (cat, model_type)) best_C, best_CV_acc = fc.get_best_C(CV_accuracies) print("Best C and best CV accurancy:", best_C, best_CV_acc) summ_stats.ix[cat, "%s: Best C" % model_type] = best_C summ_stats.ix[cat, "%s: CV Accur" % model_type] = best_CV_acc train_acc, test_acc = fit_model_cat(model_type, best_C, cat) print("Train accur and test accur are:", train_acc, test_acc) summ_stats.ix[cat, "%s: Train Accur" % model_type] = train_acc summ_stats.ix[cat, "%s: Test Accur" % model_type] = test_acc pd.to_pickle(summ_stats, ct.ROOT + "Pickles\\Fitted_Hierarchy\\HC_Summ_Stats.p")
# NOTE(review): the statements up to ``return`` are the tail of a function
# that starts before this chunk; its header and indentation are not
# visible here.
text = re.sub(r" u s ", " american ", text)
# NOTE(review): in a regular expression ``\0`` is the NUL character, so
# this only matches NUL followed by "s" -- presumably "0s" was meant; confirm.
text = re.sub(r"\0s", "0", text)
text = re.sub(r" 9 11 ", "911", text)
text = re.sub(r"e - mail", "email", text)
text = re.sub(r"j k", "jk", text)
# Collapse runs of two or more whitespace characters into one space.
text = re.sub(r"\s{2,}", " ", text)
# Return the cleaned text
return (text)

# Script: clean both question columns in two passes (preprocessing, then
# text_to_wordlist), split the cleaned rows back into train and test by
# the number of train rows, and pickle both frames.
clean_q1 = []
clean_q2 = []
for i in tqdm(np.arange(data_all.shape[0])):
    clean_q1.append(preprocessing(data_all[i][0]))
    clean_q2.append(preprocessing(data_all[i][1]))
for i in tqdm(np.arange(data_all.shape[0])):
    clean_q1[i] = text_to_wordlist(clean_q1[i])
    clean_q2[i] = text_to_wordlist(clean_q2[i])
train_clean = pd.DataFrame()
test_clean = pd.DataFrame()
# The first train.shape[0] entries are the train rows, the rest are test.
train_clean['question1'] = clean_q1[:train.shape[0]]
train_clean['question2'] = clean_q2[:train.shape[0]]
test_clean['question1'] = clean_q1[train.shape[0]:]
test_clean['question2'] = clean_q2[train.shape[0]:]
pd.to_pickle(train_clean, '../X_v2/train_final_clean.pkl')
pd.to_pickle(test_clean, '../X_v2/test_final_clean.pkl')
ft_model_path=ft_model_path)
# NOTE(review): the line above closes a call that starts before this chunk.


def evaluate(epoch=None):
    """Load the stored weights, evaluate on the dev data and print the
    two scores returned by ``evaluate_on_dev``.

    ``epoch`` is not used in this function.
    """
    trainer, dev_data = prepare_model(training=False, test_code=False, load_weights=True, lr=0.5)
    em, f1 = trainer.evaluate_on_dev(dev_data, BATCH_SIZE // 3, -1)  # can control huge data
    print('--Dev Extract Match score :%f--------F1 score:%f' % (em, f1))


# Entry point: the action to run is read from the configuration.
if __name__ == '__main__':
    action = cfg.get('Action', 'ACTION')
    if action == 'train':
        # training
        prepare_model(training=True, test_code=False, load_weights=False, lr=LEARNING_RATE)
    elif action == 'finetune':
        assert '_ft' in MODEL_NAME, 'finetune action should append the model name with _ft'
        # finetune, starting from the model file named without the '_ft' suffix
        finetune(ft_model_path='./model/squad_' + MODEL_NAME.replace('_ft', '') + '.hdf5')
    elif action == 'predict':
        # predict
        pred = predict(text=False)
        pd.to_pickle(pred, './pred.pkl')
def config_create(main_model, sector_name, result_file_name, config_name, data, time_para, pot_in_num, leve_ratio_num, sp_in, ic_num, fit_ratio, n, use_factor_num):
    """Select factors from a result table and pickle them as a config.

    Rows of ``data`` for the given ``time_para`` are kept when the absolute
    values of 'ic', 'pot_in', 'leve_ratio', 'sp_in' and 'fit_ratio' all
    exceed the matching thresholds. Their PnL series are loaded and
    clustered into ``n`` groups with KMeans on the sign of the PnL; from
    every group a share of the rows (lowest 'sp_in' first) is kept. The
    selection plus several settings of ``main_model`` are written to
    '/mnt/mfs/dat_whs/alpha_data/<config_name>.pkl'.
    """
    # NOTE(review): time_para_dict is filled but not read anywhere in this
    # function.
    time_para_dict = dict()
    time_para_dict['time_para_4'] = [
        pd.to_datetime('20140601'),
        pd.to_datetime('20180601'),
        pd.to_datetime('20180901')
    ]
    time_para_dict['time_para_5'] = [
        pd.to_datetime('20140701'),
        pd.to_datetime('20180701'),
        pd.to_datetime('20180901')
    ]
    time_para_dict['time_para_6'] = [
        pd.to_datetime('20140801'),
        pd.to_datetime('20180801'),
        pd.to_datetime('20180901')
    ]
    data_n = data[data['time_para'] == time_para]
    # Keep the rows whose metrics all exceed their thresholds in magnitude.
    a_n = data_n[(data_n['ic'].abs() > ic_num) &
                 (data_n['pot_in'].abs() > pot_in_num) &
                 (data_n['leve_ratio'].abs() > leve_ratio_num) &
                 (data_n['sp_in'].abs() > sp_in) &
                 (data_n['fit_ratio'].abs() > fit_ratio)]
    # The PnL file of a row is named "<time_para>|<key>|<fun_name>".
    a_n['pnl_file_name'] = a_n[['time_para', 'key', 'fun_name'
                                ]].apply(lambda x: '|'.join(x.astype(str)), axis=1)
    print(a_n['con_out_2'].sum() / len(a_n), len(a_n))
    # +1 when sp_m is positive, -1 otherwise.
    a_n['buy_sell'] = (a_n['sp_m'] > 0).astype(int).replace(0, -1)
    use_factor_ratio = use_factor_num / len(a_n.index)
    pnl_save_path = '/mnt/mfs/dat_whs/data/mix_factor_pnl/' + result_file_name
    sum_pnl_df = pd.DataFrame()
    for i in a_n.index:
        pnl_file_name = a_n['pnl_file_name'].loc[i]
        print('***************************************************')
        print('now {}\'s is running, key={}'.format(i, pnl_file_name))
        # The files carry a '.csv' suffix but are read as pickles.
        pnl_df = pd.read_pickle(
            os.path.join(pnl_save_path, '{}.csv'.format(pnl_file_name)))
        # NOTE(review): the assignment target is missing on the next line
        # (and on three more lines below) -- these lines do not parse as
        # written. group_df is later looked up by pnl_file_name, so the
        # loaded series was presumably labelled with it here; restore the
        # targets from version control.
        = pnl_file_name
        sum_pnl_df = pd.concat([sum_pnl_df, pnl_df], axis=1)
    # _________________________________________________________________________________
    part_sum_pnl_df = sum_pnl_df.loc[:pd.to_datetime('20180601')]
    sharpe_df_after = part_sum_pnl_df.iloc[-100:].apply(bt.AZ_Sharpe_y)
    = 'sharpe_df_after'
    sharpe_df_before = part_sum_pnl_df.iloc[:-100].apply(bt.AZ_Sharpe_y)
    = 'sharpe_df_before'
    sharpe_df = part_sum_pnl_df.apply(bt.AZ_Sharpe_y)
    = 'sharpe_df'
    # info_df = pd.concat([sharpe_df_before, sharpe_df_after], axis=1)
    # _________________________________________________________________________________
    # Cluster the PnL series on their sign pattern (1 where PnL > 0).
    target_df = (sum_pnl_df > 0).astype(int)
    kmeans = KMeans(n_clusters=n).fit(target_df.T)
    kmeans_result = kmeans.labels_
    columns_list = target_df.columns
    group_df = pd.DataFrame(kmeans_result, index=columns_list)
    file_name_list = a_n['pnl_file_name'].values
    a_n['group_key'] = group_df.loc[file_name_list].values
    # target_df is reused from here on for the selected factor rows.
    target_df = pd.DataFrame()
    for i in range(n):
        # Within a cluster the rows with the smallest sp_in come first.
        part_a_n = a_n[a_n['group_key'] == i].sort_values(by='sp_in')
        part_num = int(len(part_a_n) * use_factor_ratio)
        part_target_df = part_a_n[[
            'fun_name', 'name1', 'name2', 'name3', 'buy_sell'
        ]].iloc[:part_num]
        print(part_num)
        target_df = target_df.append(part_target_df)
    print(len(target_df))
    print(Counter(target_df['name1'].values))
    print(Counter(target_df['name2'].values))
    print(Counter(target_df['name3'].values))
    config_info = dict()
    config_info['factor_info'] = target_df
    config_info['sector_name'] = sector_name
    config_info['result_file_name'] = result_file_name
    config_info['if_weight'] = main_model.if_weight
    config_info['ic_weight'] = main_model.ic_weight
    config_info['hold_time'] = main_model.hold_time
    config_info['if_hedge'] = main_model.if_hedge
    config_info['if_only_long'] = main_model.if_only_long
    pd.to_pickle(config_info, '/mnt/mfs/dat_whs/alpha_data/{}.pkl'.format(config_name))
def trial(model_file_name, scenario, number_of_trials, rendering=False, graphs_suffix='', verbose=C_VERBOSE_NONE,
          store_history=False, compute_saliency=False, history_save_path='./output/history_test.pkl'):
    """
    Summary:
        Evaluate the trained DQN for a number of trials (number_of_trials).

    Args:
        model_file_name: string
            The saved trained DQN (Keras DNN h5 file).

        scenario: string
            The OpenAI gym scenario to be loaded by the Emulator.

        number_of_trials: int
            How many trials to execute.

        rendering: boolean
            If True, OpenAI gym environment rendering is enabled.

        graphs_suffix: string
            A suffix added in the graphs file names. To be used in case of multiple trials.
            Within this function it is only printed.

        verbose: int
            Verbose level (0: None, 1: INFO, 2: DEBUG)

        store_history: bool
            Store history data or not.

        compute_saliency: bool
            Computes saliency or not.

        history_save_path: str
            Where to store the history file.

    Raises:
        -

    Returns:
        trials_average_reward: float
            The last entry of column 3 of the emulator's execution statistics
            (the average reward over the trials).

    notes:
        -
    """
    if verbose > C_VERBOSE_NONE:
        print('\nEvaluate the trained DQN in ', str(number_of_trials), ' trials (episodes).', sep='')
        print('- model_file_name = ', model_file_name, ', scenario = ', scenario, ', number_of_trials = ',
              number_of_trials, ', rendering = ', rendering, ', graphs_suffix = ', graphs_suffix, sep='')

    # Create a Emulator object instance (with the fixed seed 42)
    emulator = em.Emulator(scenario=scenario, average_reward_episodes=number_of_trials, statistics=True,
                           rendering=rendering, seed=42, verbose=verbose)

    # Create a Deep Neural Network object instance and load the trained model (model_file_name)
    dnn = deepNeuralNetwork.DeepNeuralNetwork(file_name=model_file_name, verbose=verbose)

    # Start measuring Trials time
    start_time = time.time()

    # One list per recorded quantity; an entry is appended for every step.
    history = {
        'trial': [],
        'state': [],
        'action': [],
        'reward': [],
        'next_state': [],
        'done': [],
        'q_values': []
    }
    if compute_saliency:
        history['saliency'] = []

    # Trials
    # used as baseline for perturbation
    # for each feature, apply a random noise of 0.2 * (max(feature) - min(feature))
    state_min = np.array([-0.354871, -0.10391249, -0.468456, -0.89336216, -0.15218297, -0.4017307, 0, 0])
    state_max = np.array([-0.00462484, 1.4088593, 0.12988918, 0.05392841, 0.5564749, 0.8584606, 1, 1])
    for i in range(number_of_trials):
        current_state = emulator.start()
        while emulator.emulator_started:
            # Greedy action: the one with the largest predicted Q value.
            q_values = dnn.predict(current_state)
            action = np.argmax(q_values)
            if compute_saliency:
                # compute saliency
                saliency = np.zeros(NUM_STATE)
                for _ in range(NUM_SALIENCY_TESTS):
                    for j in range(NUM_STATE):
                        # perturb state
                        perturbed_state = np.array(current_state)
                        if j < 6:
                            # numerical states: replace the value by a random
                            # point above the feature minimum
                            perturbed_state[j] = SALIENCY_PERTURBATION * np.random.rand() \
                                * (state_max[j] - state_min[j]) + state_min[j]
                        else:
                            # boolean states: flip the flag
                            perturbed_state = current_state.copy()
                            perturbed_state[j] = 1 - perturbed_state[j]
                        q_values_preturbed = dnn.predict(perturbed_state)
                        # NOTE(review): q_values is rescaled in place on
                        # every pass, so later passes (and the q_values
                        # stored in the history) see already-normalized
                        # values -- confirm this is intended.
                        max_q = np.max(q_values)
                        q_values /= max_q
                        q_values_preturbed /= max_q
                        q_value_dict = {a: q_values[0, a].astype(np.float64) for a in range(4)}
                        q_value_preturbed_dict = {a: q_values_preturbed[0, a].astype(np.float64) for a in range(4)}
                        # NOTE(review): each test overwrites saliency[j]
                        # rather than adding to it, yet the result is
                        # divided by the number of tests below -- confirm.
                        saliency[j] = sarfa_saliency.computeSaliencyUsingSarfa(action, q_value_dict,
                                                                                q_value_preturbed_dict)[0]
                saliency /= NUM_SALIENCY_TESTS

            # Experience [s, a, r, s']
            experience = emulator.applyAction(action)

            # save data
            if store_history:
                history['trial'].append(i)
                history['state'].append(current_state)
                history['action'].append(action)
                history['reward'].append(experience[2])
                if experience[3] is not None:
                    history['next_state'].append(experience[3])
                    history['done'].append(False)
                else:
                    # No next state: the current state is stored in its
                    # place and the step is flagged as done.
                    history['next_state'].append(current_state)
                    history['done'].append(True)
                history['q_values'].append(q_values)
                if compute_saliency:
                    history['saliency'].append(saliency)
            current_state = experience[3]

    if store_history:
        for k in history.keys():
            history[k] = np.array(history[k])
        # NOTE(review): for a bare file name the directory part is '' and
        # os.makedirs('') raises -- confirm callers always pass a path with
        # a directory component.
        history_save_dir = os.path.split(history_save_path)[0]
        if not os.path.exists(history_save_dir):
            os.makedirs(history_save_dir)
        pd.to_pickle(history, history_save_path)

    if verbose > C_VERBOSE_NONE:
        print('\nDQN ', str(number_of_trials), ' trials average = ', emulator.execution_statistics.values[-1, 3],
              ', in ', executionTimeToString(time.time() - start_time), sep='')

    return emulator.execution_statistics.values[-1, 3]
def train(scenario, average_reward_episodes, rendering, hidden_layers, hidden_layers_size, memory_size,
          minibatch_size, optimizer_learning_rate, gamma, epsilon_decay_factor, maximum_episodes,
          model_file_name, converge_criteria=None, graphs_suffix='', seed=None, verbose=C_VERBOSE_NONE,
          store_history=False, history_save_path='./output/history_train.pkl'):
    """
    Summary:
        Trains a DQN model for solving the given OpenAI gym scenario.

    Args:
        scenario: string
            The OpenAI gym scenario to be solved.

        average_reward_episodes: int
            On how many consecutive episodes the averaged reward should be calculated.

        rendering: boolean
            If True, OpenAI gym environment rendering is enabled.

        hidden_layers: int
            The number of hidden layers of the Deep Neural Network. Not including the first
            and last layer.

        hidden_layers_size: int
            The size of each hidden layer of the Neural Network.

        memory_size: int
            The size of the replay memory feature which will be used by the DQN.

        minibatch_size: int
            The minibatch size which will be retrieved randomly from the memory in each
            iteration in the DQN.

        optimizer_learning_rate: float
            The Adam optimizer learning rate used in the DNN.

        gamma: float
            The discount factor to be used in the equation (3) of [1].

        epsilon_decay_factor: float
            The decay factor of epsilon parameter, for each iteration step.

        maximum_episodes: int
            The maximum number of episodes to be executed. If DQN converges earlier the
            training stops.

        model_file_name: string
            The file in which the DQN trained model (DNN Keras) should be saved.

        converge_criteria: int or None
            The DQN converge criteria (when for converge_criteria consecutive episodes the
            average reward is > 200, the DQN is assumed to have converged). If None, the
            training continues till the maximum_episodes is reached.

        graphs_suffix: string
            A suffix added in the graphs file names. To be used in case of multiple trains.

        seed: int
            Optional seed to be used with the OpenAI gym environment, for results
            reproducibility.

        verbose: int
            Verbose level (0: None, 1: INFO, 2: DEBUG)

        store_history: bool
            If True, every step (trial, state, action, reward, next_state, done, q_values)
            is recorded and pickled to history_save_path when training ends.

        history_save_path: str
            Where to store the history file. Missing parent directories are created; a bare
            file name (no directory part) is written to the current directory.

    Raises:
        -

    Returns:
        convergence_episode: int
            The index of the episode at which the convergence criterion was met, or 0 if it
            was never met within maximum_episodes.
            Returns None if converge_criteria is None.

    notes:
        Besides the return value, this function writes two PNG graphs
        ('Steps_Per_Episode<suffix>.png', 'Total_Reward_Per_Training_Episode<suffix>.png')
        to the current directory and saves the model through dnn.saveModel.
    """

    if verbose > C_VERBOSE_NONE:
        print('\nDQN Training Starts (scenario = ', scenario, ', average_reward_episodes = ', average_reward_episodes,
              ', rendering = ', rendering, ', hidden_layers = ', hidden_layers, ', hidden_layers_size = ',
              hidden_layers_size, ', memory_size = ', memory_size, ', minibatch_size = ', minibatch_size,
              ', optimizer_learning_rate = ', optimizer_learning_rate, ', gamma = ', gamma,
              ', epsilon_decay_factor = ', epsilon_decay_factor, ', maximum_episodes = ', maximum_episodes,
              ', model_file_name = ', model_file_name, ', converge_criteria = ', converge_criteria,
              ', graphs_suffix = ', graphs_suffix, ', seed = ', seed, ')', sep='')

    # If a seed is given then apply it
    if seed is not None:
        applySeed(seed, verbose)

    # Create an Emulator object instance
    emulator = em.Emulator(scenario, average_reward_episodes, statistics=True, rendering=rendering,
                           seed=seed, verbose=verbose)

    # Create a Deep Neural Network object instance (Keras with Tensor Flow backend)
    dnn = deepNeuralNetwork.DeepNeuralNetwork(inputs=emulator.state_size, outputs=emulator.actions_number,
                                              hidden_layers=hidden_layers,
                                              hidden_layers_size=hidden_layers_size,
                                              optimizer_learning_rate=optimizer_learning_rate,
                                              seed=seed, verbose=verbose)

    # Create a DQN object instance (we start always from epsilon = 1.0, we control each value
    # with the epsilon_decay_factor)
    dqn = deepQNetwork.DeepQNetwork(emulator=emulator, dnn=dnn, states_size=emulator.state_size,
                                    actions_number=emulator.actions_number, memory_size=memory_size,
                                    minibatch_size=minibatch_size, gamma=gamma, epsilon=1.0,
                                    epsilon_decay_factor=epsilon_decay_factor, seed=seed, verbose=verbose)

    # Start measuring training time
    start_time = time.time()

    # Per-step records; only filled (and saved) when store_history is True
    history = {
        'trial': [],
        'state': [],
        'action': [],
        'reward': [],
        'next_state': [],
        'done': [],
        'q_values': []
    }

    if converge_criteria is not None:
        # Holds how many consecutive episodes average reward is > 200
        convergence_counter = 0
        episodes_convergence_counter = []  # Holds the convergence_counter for all episodes
        convergence_episode = 0

    # Training starts here
    for i in range(maximum_episodes):
        current_state = emulator.start()

        # See Algorithm 1 in [1]
        while emulator.emulator_started:
            # NOTE(review): the action is picked greedily from the network's Q-values here;
            # the epsilon given to DeepQNetwork is not consulted in this loop - confirm that
            # skipping exploration at this point is intended.
            q_values = dnn.predict(current_state)
            action = np.argmax(q_values)

            # Experience [s, a, r, s']
            experience = emulator.applyAction(action)

            # save data
            if store_history:
                history['trial'].append(i)
                history['state'].append(current_state)
                history['action'].append(action)
                history['reward'].append(experience[2])
                if experience[3] is not None:
                    history['next_state'].append(experience[3])
                    history['done'].append(False)
                else:
                    # No next state was returned: record the current state as a placeholder
                    # and flag the step as done.
                    history['next_state'].append(current_state)
                    history['done'].append(True)
                history['q_values'].append(q_values)

            dqn.storeTransition(experience)
            dqn.sampleRandomMinibatch()

            # s = s' at the end of the step, before starting the new step
            current_state = experience[3]

        if converge_criteria is not None:
            # Check if convergence counter should be increased or to be reset
            if emulator.average_reward > 200:
                convergence_counter += 1
            else:
                convergence_counter = 0

            episodes_convergence_counter.append(convergence_counter)

            if verbose > C_VERBOSE_NONE:
                print('Convergence Counter: ', convergence_counter, sep='')

            # DQN model assumed that it has been converged
            if convergence_counter >= converge_criteria:
                convergence_episode = i
                break

    if store_history:
        for k in history.keys():
            history[k] = np.array(history[k])

        history_save_dir = os.path.split(history_save_path)[0]
        # A bare file name has an empty directory part; os.makedirs('') would raise, so only
        # create the directory when there is one to create.
        if history_save_dir and not os.path.exists(history_save_dir):
            os.makedirs(history_save_dir)
        pd.to_pickle(history, history_save_path)

    if converge_criteria is not None:
        convergence_time = time.time() - start_time

    if verbose > C_VERBOSE_NONE and converge_criteria is not None:
        print('\nDQN converged after ', convergence_episode, ' episodes in ',
              executionTimeToString(convergence_time), sep='')
    elif verbose > C_VERBOSE_NONE and converge_criteria is None:
        print('\nDQN trained for ', maximum_episodes, ' episodes in ',
              executionTimeToString(time.time() - start_time), sep='')

    # Create Graphs
    # Columns of the statistics table as plotted below: 0 = episode, 1 = steps,
    # 2 = total reward, 3 = running reward average.
    stats = emulator.execution_statistics.values

    # 1. Steps per Episode
    plt.plot(stats[:, 0], stats[:, 1], color='coral', linestyle='-')
    # The on/off flag is passed positionally: its keyword was renamed from `b` to `visible`
    # in newer Matplotlib, so neither keyword works on every version.
    plt.grid(True, which='major', axis='y', linestyle='--')
    plt.xlabel('Episode', fontsize=12)
    plt.ylabel('Steps', fontsize=12)
    plt.title('Steps per Episode', fontsize=12)
    plt.savefig('Steps_Per_Episode' + graphs_suffix + '.png')
    plt.clf()

    # 2. Total Reward per Training Episode
    plt.plot(stats[:, 0], stats[:, 2], color='coral', linestyle='-', label='Total Reward')
    plt.plot(stats[:, 0], stats[:, 3], color='midnightblue', linestyle='--',
             label='Episodes Reward Average')
    plt.grid(True, which='major', axis='y', linestyle='--')
    plt.xlabel('Episode', fontsize=12)
    plt.ylabel('Reward', fontsize=12)
    plt.title('Total Reward per Training Episode', fontsize=12)
    plt.legend(loc='lower right', fontsize=12)
    plt.savefig('Total_Reward_Per_Training_Episode' + graphs_suffix + '.png')
    plt.clf()

    # Save the trained model
    dnn.saveModel(model_file_name)

    if converge_criteria is not None:
        return convergence_episode
    return None
def kfold_run(self,
              X_train, y_train,
              X_test=None, y_test=None,
              model_params=None,
              n_folds=5,
              stratify=False,
              index_number=None,
              flow_augment=False,
              save_oof=False,
              ):
    """KFold/StratifiedKFold run.

    Trains (or loads) one model per fold, fills out-of-fold predictions for the
    training set and, when a test set is given and `self.predict_test` is set,
    per-fold predictions for the test set.

    # Arguments
        X_train: (numpy array), training set.
        y_train: (numpy array), training set labels. 1-D labels are reshaped
            to a single column.
        X_test: (numpy array), test set.
        y_test: (numpy array), test set labels. Not used by this method.
        model_params: (Dict), dictionary of model parameters.
        n_folds: (Int), number of folds used in training.
        stratify: (Boolean), whether fold split should be stratified
            according to labels distribution. One-hot labels are converted
            back to class indices for the split only.
        index_number: (Int), index specifying from which bag should
            training or prediction be started.
        flow_augment: (Boolean), whether to use data augmentation during
            training and test prediction.
        save_oof: (Boolean), whether to automatically save oof predictions.
            Assumes oof/train and oof/test folders in source directory.

    # Returns
        model: (Keras model), trained model for last fold.
        oof_train: (numpy array), out-of-fold training set predictions.
            With test prediction it has shape y_train.shape + (1,);
            otherwise that trailing axis is averaged away.
        if self.predict_test and X_test is given, additionally:
            oof_test: (numpy array), test set predictions with one slice
            per fold on the last axis.
    """

    if index_number is not None:
        self.i = index_number
    oof_index = 0

    if len(y_train.shape) == 1:
        y_train = y_train.reshape((y_train.shape[0], 1))

    self.oof_train = np.zeros(y_train.shape + (1,))
    print('OOF train predictions shape: {}'.format(self.oof_train.shape))

    if X_test is not None:
        self.oof_test = np.zeros(
            (X_test.shape[0],) + y_train.shape[1:] + (n_folds,))
        print('OOF test predictions shape: {}'.format(self.oof_test.shape))

    # random_state only has an effect when shuffling, and recent scikit-learn
    # rejects a non-None random_state together with shuffle=False.
    split_seed = self.seed if self.shuffle else None

    if stratify:
        if self.oof_train.shape[-2] != 1:
            print(
                'To use StratifiedKFold please provide categorically encoded labels, not One-Hot encoded. '
                ' \n Reversing OH encoding now.')
            y_train_split = pd.DataFrame(y_train).idxmax(axis=1).values
            print('Labels after reversed encoding:', y_train_split[:10])
        else:
            # Labels are already a single column of classes: stratify on
            # them directly instead of silently falling back to plain KFold.
            y_train_split = y_train.ravel()
        kf = StratifiedKFold(
            n_splits=n_folds, shuffle=self.shuffle, random_state=split_seed)
    else:
        kf = KFold(
            n_splits=n_folds, shuffle=self.shuffle, random_state=split_seed)
        y_train_split = y_train

    for train_index, test_index in kf.split(X_train, y_train_split):
        print('Training on fold:', self.i, '\n')

        X_tr, X_val = X_train[train_index], X_train[test_index]
        y_tr, y_val = y_train[train_index], y_train[test_index]

        model = self.model_name(model_params)

        if self.save_statistics:
            os.makedirs('{}{}'.format(
                self.checkpoints_dst, self.run_save_name), exist_ok=True)

        if self.save_model:
            self.callbacks_append_checkpoint('fold')
        if self.save_history:
            self.callbacks_append_logger('fold')

        if self.load_keras_model:
            model = self.load_trained_model('fold')
        else:
            if flow_augment:
                print('Training with data augmentation.')
                history = model.fit_generator(
                    self.train_datagen.flow(
                        X_tr, y_tr, batch_size=self.batch_size),
                    steps_per_epoch=X_tr.shape[0] / self.batch_size,
                    epochs=self.number_epochs,
                    validation_data=self.valid_datagen.flow(
                        X_val, y_val, batch_size=self.batch_size,
                        shuffle=False),
                    validation_steps=X_val.shape[0] / self.batch_size,
                    callbacks=self.model_callbacks)
            else:
                history = model.fit(
                    X_tr, y_tr, verbose=self.verbose,
                    batch_size=self.batch_size,
                    epochs=self.number_epochs,
                    validation_data=(X_val, y_val),
                    callbacks=self.model_callbacks)

        # A loaded model has no training history to record.
        if not self.load_keras_model:
            validation_loss = history.history['val_loss']
            self.loss_history.append(validation_loss)
            self.min_losses.append(np.min(validation_loss))
            if self.output_statistics:
                self.output_run_statistics('fold')

        print('Predicting on validation data.')
        self.oof_train[test_index, :, 0] = model.predict(
            X_val, batch_size=self.batch_size)
        if self.verbose:
            print(('Validation split - standard deviation for original target values: {} \n '
                   ' for predicted target values: {} \n \n').format(
                np.std(y_val), np.std(self.oof_train[test_index, :])))

        if self.predict_test and X_test is not None:
            print('Predicting on test data.')
            if flow_augment:
                self.oof_test[:, :, oof_index] = self.flow_predict_test_augment(
                    X_test, model)
            else:
                self.oof_test[:, :, oof_index] = model.predict(
                    X_test, batch_size=self.batch_size)
            oof_index += 1

        self.i += 1

    if not self.load_keras_model and self.output_statistics:
        self.output_run_statistics('fold')

    if self.predict_test and save_oof:
        # File names carry the mean of the per-fold minimum validation losses.
        mean_min_loss = np.array(self.min_losses).mean(axis=0)
        pd.to_pickle(np.array(self.oof_train), 'oof/train/{}_{:.5f}.pkl'.format(
            self.run_save_name, mean_min_loss))
        # Test predictions only exist for this run when a test set was given.
        if X_test is not None:
            pd.to_pickle(np.array(self.oof_test), 'oof/test/{}_{:.5f}.pkl'.format(
                self.run_save_name, mean_min_loss))

    if self.predict_test and X_test is not None:
        return model, np.array(self.oof_train), np.array(self.oof_test)
    return model, np.array(self.oof_train).mean(axis=-1)
# 'GROUP', 'TYPE', 'CSS', 'NIP']
# Restrict the news frame to the columns used below.
cols = ['TIMESTAMP_UTC', 'EVENT_SENTIMENT_SCORE', 'EVENT_RELEVANCE', 'CSS', 'NIP']
df = df[cols]

prices = read_data_from_csv(prices_path)
# Keep only the first ten characters of each date string (****-**-** format).
prices['Date'] = prices['Date'].apply(lambda date_str: date_str[:10])

news = adjust_dates_to_trading_dates(df)
prices = remove_prices_no_news(prices, news)
news = remove_non_trading_dates(prices, news)

# Sanity checks: one price row per distinct news date, and both date sets agree.
assert prices.shape[0] == np.unique(news['Date']).shape[0]
assert list(np.unique(prices['Date'])) == list(map(str, np.unique(news['Date'])))

counts, indices = group_by_date(news)
assert len(counts) == prices.shape[0]

news = weight_news_by_time(news, counts, indices)
print(news.head())

# Feature table: weighted news combined with prices.
X = concat_with_prices(prices, news)
print(X.head())
pd.to_pickle(X, '../Data/IBM_X_data.pkl', protocol=4)

# Target: the closing price column.
y = prices['Close']
pd.to_pickle(y, '../Data/IBM_close_data.pkl', protocol=4)
def save_progress(names, new_patents, pickle_names_path, output_path):
    """Checkpoint the current progress.

    Pickles ``names`` to ``pickle_names_path`` with ``pd.to_pickle`` and then
    passes ``new_patents`` and ``output_path`` to ``store_patents``.
    """
    pd.to_pickle(names, pickle_names_path)
    store_patents(new_patents, output_path)