def __init__(self, tep_file_fault_free, tep_file_faulty, is_test=False, normalize=True):
    """
    Args:
        tep_file_fault_free (string): path to the fault-free TEP .RData file
        tep_file_faulty (string): path to the faulty TEP .RData file
        is_test (bool): whether to load the testing split instead of training
        normalize (bool): whether to normalize the data in [-1, 1]
    """
    if "sampled" in tep_file_fault_free:
        df = pd.read_pickle(tep_file_fault_free)
    else:
        fault_free = py.read_r(tep_file_fault_free)
        faulty = py.read_r(tep_file_faulty)
        if is_test:
            df = pd.concat([
                fault_free['fault_free_testing'], faulty['faulty_testing']
            ])
        else:
            df = pd.concat([
                fault_free['fault_free_training'], faulty['faulty_training']
            ])

    # todo: add conditioning on fault number; for now we generate only the normal condition
    df = df[df.faultNumber == 0]

    work_with_columns = ['faultNumber', 'simulationRun', 'sample', 'xmeas_1']
    # one sequence per (faultNumber, simulationRun) pair, shaped (runs, seq_len, 1)
    raw_data = torch.from_numpy(
        np.expand_dims(
            np.array([
                g[1]["xmeas_1"]
                for g in df[work_with_columns].groupby(['faultNumber', 'simulationRun'])
            ]), -1)).float()

    # sanity check that the grouping above preserves the original ordering
    assert np.allclose(
        raw_data.squeeze()[0, :].numpy(),
        df[(df.simulationRun == 1) & (df.faultNumber == 0)].xmeas_1.values)

    self.data = self.normalize(raw_data) if normalize else raw_data
    self.seq_len = raw_data.size(1)

    # Estimate distribution parameters (Gaussian) of the deltas between the
    # last and first samples, on both the raw and the normalized data.
    original_deltas = raw_data[:, -1] - raw_data[:, 0]
    self.original_deltas = original_deltas
    self.or_delta_max, self.or_delta_min = original_deltas.max(), original_deltas.min()
    deltas = self.data[:, -1] - self.data[:, 0]
    self.deltas = deltas
    self.delta_mean, self.delta_std = deltas.mean(), deltas.std()
    self.delta_max, self.delta_min = deltas.max(), deltas.min()
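# The normalize() method called above is not part of this snippet; a minimal
# sketch consistent with the "[-1, 1]" range in the docstring (an assumption,
# not the source's implementation):
def normalize(self, x):
    # min-max scale all values into [-1, 1]
    x_min, x_max = x.min(), x.max()
    return 2 * (x - x_min) / (x_max - x_min) - 1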
def load_1ddata(filename0, filename1):
    assert filename0[:10] == filename1[:10], 'Different dates in the two datasets!'
    df0 = pyreadr.read_r('%s%s' % (input_dir0, filename0))['indicator_1m']
    df0['Symbol'] = [i[2:] for i in df0['Symbol']]
    df1 = pyreadr.read_r('%s%s' % (input_dir1, filename1))['data_1m'].iloc[:, :-4]
    df1['Time'] = df1['minute']
    del df1['minute']
    tdf = df1.merge(df0, on=['Date', 'Time', 'Symbol'], how='inner')
    return tdf
def load_metadata(self, filename=None, alt_url=None, out_dir=None):
    """
    Load the AURN metadata from file or URL.

    If filename is None, will download the metadata from the URL: alt_url
    Otherwise, will load from the local file: filename

    Dependencies:
        Path

    Args:
        filename: (string) Valid file name of existing AURN metadata R file, or None
        alt_url:  (string) Valid URL pointing to AURN metadata downloadable source, or None
        out_dir:  (string) Directory to store the .RData file if alt_url is used.

    Returns:
        Dataframe containing the downloaded data
    """
    assert filename is None or Path(filename).is_file(), ValueError(
        'Invalid filename: {}'.format(filename))

    # Has a filename been entered and does the file exist?
    if filename is not None:
        print("Metadata file {} exists so will use this".format(filename))
        filename = Path(filename)
        # Read the RData file into a Pandas dataframe
        try:
            print('Reading filename {} into dataframe.'.format(filename.name))
            return pyreadr.read_r(str(filename))
        except Exception as err:
            raise ValueError(
                'Error reading into dataframe from R file: {}. {}'.format(
                    filename, err))

    # No filename, so try the URL if one exists, or raise an error
    assert alt_url is not None, ValueError('No filename or url given')

    # Does the URL alternative exist and does it work
    print("\nDownloading data file using url {}".format(alt_url))
    try:
        print('\nLoading metadata file from url')
        filename = Path(wget.download(alt_url, out_dir))
        return pyreadr.read_r(str(filename))
    except Exception as err:
        raise ValueError(
            'Error obtaining metadata file from url: {}. {}'.format(
                alt_url, err))
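# Usage sketch; the URL below is an assumption based on the public AURN data
# naming, not confirmed by this snippet:
# meta = extractor.load_metadata(
#     alt_url='https://uk-air.defra.gov.uk/openair/R_data/AURN_metadata.RData',
#     out_dir='cached_data')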
def __init__(self, tep_file_fault_free, tep_file_faulty, window_size=10, is_test=False, transform=None):
    self.window_size = window_size
    self.is_test = is_test
    self.transform = transform

    if "sampled" in tep_file_fault_free:
        self.df = pd.read_pickle(tep_file_fault_free)
    else:
        fault_free = py.read_r(tep_file_fault_free)
        faulty = py.read_r(tep_file_faulty)
        if is_test:
            self.df = pd.concat([
                fault_free['fault_free_testing'], faulty['faulty_testing']
            ])
        else:
            self.df = pd.concat([
                fault_free['fault_free_training'], faulty['faulty_training']
            ])

    # the concatenated dataset has a broken index, so rebuild it
    self.df = self.df \
        .sort_values(by=["faultNumber", "simulationRun", "sample"], ascending=True) \
        .reset_index(drop=True)

    self.class_count = len(self.df.faultNumber.value_counts())
    self.runs_count = self.df.faultNumber.unique().shape[0] * self.df.simulationRun.unique().shape[0]
    self.sample_count = 960 if is_test else 500
    self.shots_count = self.sample_count - self.window_size + 1

    # making labels according to TEP: faults are injected only after sample 160
    # (test) or 20 (train), so earlier samples of faulty runs are labeled normal
    self.labels = self.df.loc[:, ["faultNumber", "sample"]]
    self.labels.loc[:, "label"] = self.labels.loc[:, "faultNumber"].astype('long')
    if is_test:
        self.labels.loc[(self.labels.label != 0) & (self.labels["sample"] <= 160), "label"] = 0
    else:
        self.labels.loc[(self.labels["label"] != 0) & (self.labels["sample"] <= 20), "label"] = 0

    self.features_count = self.df.shape[1] - 3
def rdata_data(self, filename, is_int=False, is_float=False):
    to_return = []
    data = pyreadr.read_r(filename)
    for k in data.keys():
        for r in range(0, len(data[k].index)):
            row = data[k].iloc[r, :]
            if r == 0:
                # one dict per value column, keyed by the first column
                for i in range(0, len(row) - 1):
                    to_return.append({})
            for j in range(1, len(row)):
                if is_int or is_float:
                    try:
                        if not math.isnan(float(row[j])):
                            if is_float:
                                to_return[j - 1][int(row[0])] = float(row[j])
                            else:
                                # original cast to float in both branches;
                                # is_int presumably means an integer cast
                                to_return[j - 1][int(row[0])] = int(float(row[j]))
                    except (ValueError, TypeError):
                        # non-numeric prefix (e.g. '<' or '$'): drop the first
                        # character and parse the rest
                        if len(row[j]) != 0:
                            to_return[j - 1][int(row[0])] = float(row[j][1:])
                else:
                    to_return[j - 1][int(row[0])] = row[j]
    return to_return
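# A roughly equivalent, vectorized sketch of the loop above. This is an
# assumption: it skips the defensive numeric cleaning and expects the first
# column of each data frame to hold the keys.
def rdata_data_fast(filename):
    out = []
    for df in pyreadr.read_r(filename).values():
        keyed = df.set_index(df.columns[0])
        # one dict per value column, keyed by the first column
        out.extend(keyed[col].to_dict() for col in keyed.columns)
    return out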
def import_external_file_as_dataframe(public_file_url, data_name, extension, import_method="feather"):
    absolute_dir = pathlib.Path().resolve()
    file_name = data_name + extension
    destination_path = absolute_dir.parent.joinpath("data", "public_data", file_name)
    download_a_file_from_url(public_file_url, destination_path)
    if import_method == "rdata":
        return import_rda_file_by_rdata(import_path=destination_path, dataframe_name=data_name)
    elif import_method == "feather":
        # raises "ArrowInvalid: Not a Feather V1 or Arrow IPC file" when the
        # file format is not feather
        return feather.read_dataframe(destination_path)[data_name]
    elif import_method == "pyreadr":
        return pyreadr.read_r(str(destination_path))[data_name]
    else:
        print("unregistered import method: {}".format(import_method))
        sys.exit(1)
def load_sites_and_obtain_their_grid_locations(wrf_in, sensors_file, sensor_file_type):
    # open the sensor dataset
    if sensor_file_type == 'CSV':
        sens = pd.read_csv(sensors_file, usecols=['long', 'lat', 'sensor_name'])
        # rename to match the column names used below
        sens = sens.rename(columns={'long': 'longitude', 'lat': 'latitude'})
    elif sensor_file_type == 'RDATA':
        metadata = pyreadr.read_r(sensors_file.as_posix())
        sens = metadata['AURN_metadata'][['site_id', 'latitude', 'longitude']].drop_duplicates()
    else:
        print('Sensor file type not recognised: {}'.format(sensor_file_type))
        print('Should be CSV or RDATA, please amend as appropriate.')
        quit()

    # get the indexes from the wrf file
    with salem.open_wrf_dataset(wrf_in) as ds:
        sens['iarray'], sens['jarray'] = ds.salem.grid.transform(
            sens['longitude'], sens['latitude'], nearest=True)
        # check to make sure that all stations are within our model domain
        # - drop those that aren't
        if any(sens['iarray'] > ds.dims['west_east']) or any(sens['jarray'] > ds.dims['south_north']):
            print('dropping these stations outside of the model domain:')
            print(sens[sens['jarray'] > ds.dims['south_north']])
            print(sens[sens['iarray'] > ds.dims['west_east']])
            sens = sens[sens['jarray'] <= ds.dims['south_north']]
            sens = sens[sens['iarray'] <= ds.dims['west_east']]

    return sens
def rds_to_html(file_path):
    app_rds = pyreadr.read_r(file_path)
    app_df = app_rds[None]
    app_html = app_df.iloc[0, 0]
    return app_html
def generateTrainPredictFromRDS(inFile, negProtein=None):
    """
    This function reads the RDS file and generates the training/predict sets
    using the given symbols and labels. RDS files are generated using R code.
    """
    logging.info('Loading data from RDS file to create a dictionary')
    rdsdata = pyreadr.read_r(inFile)
    df = rdsdata[None]
    trainData[True] = set(df.loc[(df['Y'] == 'pos') & (df['subset'] == 'train')]["id1"])
    trainData[False] = set(df.loc[(df['Y'] == 'neg') & (df['subset'] == 'train')]["id1"])
    # trainData[True] = set(np.where(rdsdata[None]['Y']=='pos')[0])
    # trainData[False] = set(np.where(rdsdata[None]['Y']=='neg')[0])

    # if additional negative proteins were provided, add them to the negative set
    if negProtein is not None:
        trainData[False].update(negProtein)

    # determine the train and predict sets
    predictProteinSet = allProteinIds.difference(trainData[True])
    predictProteinSet = predictProteinSet.difference(trainData[False])
    predictData['unknown'] = predictProteinSet
    logging.info(
        'Count of positive labels: {0}, count of negative labels: {1}'.format(
            len(trainData[True]), len(trainData[False])))
    if len(trainData[True]) == 0 or len(trainData[False]) == 0:
        logging.error('ML codes cannot be run with one class')
        exit()
    else:
        return trainData, predictData
def readremoteRDSdata(url=''):
    """
    Read an R RDS file from a remote Internet repository by URL.

    Parameters
    ----------
    url : string
        The raw-formatted RDS file to load in. The default is ''.

    Raises
    ------
    Exception
        Throws an exception if scratch disk is unavailable or data is inaccessible.

    Returns
    -------
    pandas dataframe
        A pandas dataframe that has the uncompressed data from the URL.
    """
    scratch = ''
    try:
        scratch = rdshandling.getfilename(url) + '.rda'
    except:
        raise Exception("Filename issue")
    result = None
    try:
        local = pyreadr.download_file(url, scratch)
        result = pyreadr.read_r(local)
    except Exception as e:
        # re-raise instead of falling through: result[None] below would fail on None
        raise Exception("Could not download or read {}: {}".format(url, e))
    return result[None]
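# Usage sketch (hypothetical URL, not from the source):
# df = readremoteRDSdata('https://example.com/datasets/mydata.rds')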
def test_plot_mse_epoch_small(self):
    '''test plot & dimensions of data'''
    # let unexpected errors surface instead of converting them into a
    # generic assertTrue(False) failure
    newres = pyreadr.read_r(projresdir + plotdata_r)
    oldres = pyreadr.read_r(test_input + plotdata_r)
    # figequal = newres['p'] == oldres['p']
    figequal = True
    tabdimequal = (newres['summtab'].shape[0] == oldres['summtab'].shape[0]
                   and newres['msetablong'].shape == oldres['msetablong'].shape)
    print("summtab_size %s msetablong_size %s\n" %
          (newres['summtab'].shape, newres['msetablong'].shape))
    self.assertTrue(figequal and tabdimequal)
def read_rds(filepath):
    """Read an RDS-format matrix into a Pandas dataframe.

    Location can be data, scratch, or results.
    The index is populated from the first column."""
    raw_df = pyreadr.read_r(filepath)[None]
    if raw_df.isnull().values.any():
        raise ValueError("NaN's were found in the data matrix.")
    return raw_df.set_index(raw_df.columns[0], drop=True)
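# Usage sketch (the path is hypothetical):
# matrix_df = read_rds("results/expression_matrix.rds")
# print(matrix_df.shape)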
def rds_to_html(file_path):
    f_name = str(Path(file_path))
    app_rds = pyreadr.read_r(f_name)
    app_df = app_rds[None]
    app_html = app_df.iloc[0, 0]
    return app_html
def rds_to_html(file):
    f_name = os.path.join(rds_directory_full_path, file)
    app_rds = pyreadr.read_r(f_name)
    app_df = app_rds[None]
    app_html = app_df.iloc[0, 0]
    return app_html
def read_projects_data():
    # DataFrame.append was removed in pandas 2.0, so collect frames and concat
    frames = []
    for week in weeks:
        projects_path, _ = week_paths[week]
        p_df = pyreadr.read_r(projects_path)[None]
        frames.append(p_df[['project_slug', 'project_name', 'creator_name']])
    result = pd.concat(frames, ignore_index=True)
    result.drop_duplicates(inplace=True)
    return result
def load_RData():
    # Reading train data in .RData format
    train_data_0 = py.read_r("data/RData/TEP_FaultFree_Training.RData")
    train_data_1 = py.read_r("data/RData/TEP_Faulty_Training.RData")
    # Reading test data in .RData format
    test_data_0 = py.read_r("data/RData/TEP_FaultFree_Testing.RData")
    test_data_1 = py.read_r("data/RData/TEP_Faulty_Testing.RData")
    print("Finished reading data.")

    # Concatenating the train and the test datasets
    tr = [train_data_0['fault_free_training'], train_data_1['faulty_training']]
    train = pd.concat(tr)  # Train dataframe
    ts = [test_data_0['fault_free_testing'], test_data_1['faulty_testing']]
    test = pd.concat(ts)  # Test dataframe

    # Save the datasets into csv files
    train.to_csv("data/train.csv")
    test.to_csv("data/test.csv")
def load_population(rds_file: str):
    """Loads population data from an RDS file.

    :param rds_file: an RDS file containing a data.frame with columns
        'age', 'LA.code', 'n'
    """
    raw = pyr.read_r(rds_file)
    df = list(raw.values())[0]
    df = df.sort_values(by=['LA.code', 'age'])
    return df[['LA.code', 'name', 'Area.name.2', 'age', 'n']]
def read_data(root):
    # read dataset
    result = pyreadr.read_r(root + 'TNBC_data/TCGA_TNBC112.RData')
    print(result.keys())  # let's check what objects we got
    gene_df = result["TCGA"]  # extract the pandas data frame for the "TCGA" object
    return gene_df.T
def load_age_mixing(rds_file: str):
    """Loads an age mixing matrix from R.

    :param rds_file: a .rds file containing an R data.frame with the mixing matrix
    """
    raw = pyr.read_r(rds_file)
    K = list(raw.values())[0]
    age_groups = K.columns
    return K.to_numpy(dtype=np.float32), age_groups
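# Usage sketch combining the two loaders above (paths are hypothetical):
# pop = load_population("data/population.rds")
# K, age_groups = load_age_mixing("data/age_mixing.rds")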
def to_pandas(filename):
    print(filename)
    df = pyreadr.read_r(filename)[None]
    print(df)
    df = df.set_index(df.columns[0])
    # keep only the suffix after the last underscore in each index label
    old_index = df.index.tolist()
    new_index = list(map(lambda x: x.split('_')[-1], old_index))
    rename_dict = dict(zip(old_index, new_index))
    df = df.rename(index=rename_dict)
    return df.T  # (samples, genes)
def read_Rda(path_to_Rda, verbose=False):
    Rdata = pyreadr.read_r(path_to_Rda)  # also works for Rds
    # unwrap single-element containers (dict -> DataFrame -> Series)
    while len(Rdata.keys()) == 1:
        keys = np.array(list(Rdata.keys()))
        Rdata = Rdata[keys[0]]
    npdata = np.array(Rdata, dtype=np.double)
    data_size = npdata.shape
    if verbose:
        print("Sample size is ", npdata.shape)
    return npdata, data_size
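# Usage sketch (hypothetical path):
# npdata, data_size = read_Rda("data/samples.Rda", verbose=True)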
def schedule(year: int) -> pd.DataFrame:
    """Get the schedule for a given year."""
    schedule_data_file: str = "/".join(
        [NFLFASTR_DATA_DIR, "schedules", f"sched_{year}.rds"])
    r_data: OrderedDict = pyreadr.read_r(schedule_data_file)
    assert set(r_data.keys()) == set([None]), "Unexpected keys"
    data: pd.DataFrame = r_data[None]
    return data
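# Usage sketch (assumes NFLFASTR_DATA_DIR points at a local copy of the
# nflfastR data repository):
# sched_2020 = schedule(2020)
# print(sched_2020.head())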
def __init__(self, filename, target="target", ignore=None):
    # read data file
    data = pyreadr.read_r(filename)[None]
    # drop ignored columns, if any (the original signature accepted an
    # `ignore` list but never used it; also avoids a mutable default)
    if ignore:
        data = data.drop(columns=ignore)
    # convert to torch tensors
    self.target = torch.tensor(data[target].values, dtype=torch.float32).unsqueeze(1)
    self.predictors = torch.tensor(np.array(data.drop(columns=target)), dtype=torch.float32)
    print("Data read successfully!")
def download_schedule_data(year):
    try:
        rds_file = pyreadr.download_file(
            f'https://github.com/nflverse/nflfastR-data/blob/master/schedules/sched_{year}.rds?raw=True',
            f'data/sched_{year}.rds')
        rds_df = pyreadr.read_r(rds_file)
        df = rds_df[None]
        df.to_csv(f'data/sched_{year}.csv.gz', compression='gzip')
        # delete the rds file
        os.remove(rds_file)
    except Exception as err:
        print(f"couldn't read schedule data for {year}: {err}")
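# Usage sketch: fetch several seasons (assumes a local "data/" directory exists):
# for yr in range(2018, 2022):
#     download_schedule_data(yr)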
def read_file(cluster_dir, test):
    cd = os.listdir(cluster_dir)
    clusters = []
    for cluster in cd:
        if cluster.endswith("_filter_" + test + ".rds"):
            pyr = pyreadr.read_r(os.path.join(cluster_dir, cluster))
            df1 = pyr[None]
            cluster_list = list(df1['gene'])
            # recover the cluster number from the file name
            cnum = str(''.join(filter(str.isdigit, cluster)))
            f = ["Cluster" + cnum, "CLUSTER" + cnum] + cluster_list
            clusters.append(f)
    return clusters
def loadData():
    # Original RData
    result = pyreadr.read_r("jobvite_1_2_merged_anonymized.RData")
    # output: odict_keys(['anon'])
    merged_anonymized = result["anon"]
    # print(merged_anonymized.shape)

    # Remove duplicates: if someone was hired for one job and rejected for
    # another, keep only the hired record
    merged_anonymized.sort_values(by=['Jobvite.ID', 'Hired'],
                                  ascending=[True, False], inplace=True)
    merged_anonymized.drop_duplicates(subset='Jobvite.ID', keep='first', inplace=True)
    # print(merged_anonymized.shape)
    return merged_anonymized
def convert(filename):
    # take the file name as input
    input_rds = filename
    # read the RDS file into an OrderedDict
    result = pyreadr.read_r(input_rds)
    # put the data into a pandas dataframe
    df = result[None]
    # output the csv (replacing the .rds extension)
    df.to_csv(f"{input_rds[:-4]}.csv")
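# Usage sketch (hypothetical file name):
# convert("measurements.rds")  # writes measurements.csv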
def load_UNSC():
    """
    Function to load the raw UNSC RData.

    Returns:
        ids (list[str]): list of unique IDs for speeches
        flat_text (list[str]): list of raw speech text
    """
    data = pyreadr.read_r("./data/UNSC/docs.RData")["raw_docs"]
    ids = data["doc_id"].tolist()
    flat_text = data["text"].tolist()
    return ids, flat_text
def italk_raw():
    """
    Returns a ``pandas.DataFrame`` with the iTalk database contents.
    """
    url = urls.urls['italk']['url']
    c = curl.Curl(url, silent=False, large=True)
    rdata_path = c.fileobj.name
    c.fileobj.close()
    rdata = pyreadr.read_r(rdata_path)['database']
    return rdata
def gen_map():
    mapdata_d = pyreadr.read_r(
        '../datasets/mapdata_copyright_openstreetmap_contributors.Rds')
    mapdata = np.reshape(to_dec(list(mapdata_d.values())[0].to_numpy()),
                         (-1, 1311)).astype(float)
    aspect = mapdata.shape[0] * 1.0 / mapdata.shape[1]
    lon_lat_box = (-88, -87.5, 41.6, 42.1)
    plt.figure(figsize=(10, 14))
    plt.imshow(mapdata, cmap=plt.get_cmap('gray'),
               extent=lon_lat_box, aspect=aspect)
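# Usage sketch (assumes the .Rds map file above exists and that to_dec is
# defined elsewhere in this module):
# gen_map()
# plt.show()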