def test_rbind_infinite():
    def foo():
        yield dt.Frame()
        yield from foo()

    with pytest.raises(RecursionError):
        dt.rbind(foo())
def test_rbind_save_to_jay(tempfile_jay):
    DT0 = dt.Frame(A=range(5), B=[None] * 5)
    DT1 = dt.Frame(A=[7, 11, 95], B=["one", "TWO", "thr33"])
    dt.rbind(DT0, DT1).to_jay(tempfile_jay)
    RES = dt.fread(tempfile_jay)
    assert_equals(RES, dt.Frame(A=[0, 1, 2, 3, 4, 7, 11, 95],
                                B=[None] * 5 + ["one", "TWO", "thr33"]))
def test_rbind_view4():
    DT = dt.Frame(A=list('abcdefghijklmnop'))
    DTempty = dt.Frame(A=[])
    DTempty.nrows = 2
    assert_equals(dt.rbind(DT[:3, :], DT[:-4:-1, :]),
                  dt.Frame(A=list('abcpon')))
    assert_equals(dt.rbind(DT[2:4, :], DTempty, DT[2:5, :]),
                  dt.Frame(A=['c', 'd', None, None, 'c', 'd', 'e']))
def test_rbind_infinite2():
    class A:
        def __next__(self):
            return self
        def __iter__(self):
            return self

    msg = r"Frame.rbind\(\) expects a list or sequence of Frames as an " \
          r"argument; instead item 0 was a <class '.*\.A'>"
    with pytest.raises(TypeError, match=msg):
        dt.rbind(A())
def test_rbind_all_stypes():
    from datetime import date as d
    sources = {
        dt.bool8: [True, False, True, None, False, None],
        dt.int8: [3, -5, None, 17, None, 99, -99],
        dt.int16: [None, 245, 872, -333, None],
        dt.int32: [10000, None, 1, 0, None, 34, -2222222],
        dt.int64: [None, 9348571093841, -394, 3053867, 111334],
        dt.float32: [None, 3.3, math.inf, -7.123e20, 34098.79],
        dt.float64: [math.inf, math.nan, 341.0, -34985.94872, 1e310],
        dt.str32: ["first", None, "third", "asblkhblierb", ""],
        dt.str64: ["red", "orange", "blue", "purple", "magenta", None],
        dt.obj64: [1, False, "yey", math.nan, (3, "foo"), None, 2.33],
        'date32': [d(2003, 11, 25), None, d(2017, 5, 16), d(2005, 6, 5)]
    }
    cat1 = [dt.bool8, dt.int8, dt.int16, dt.int32, dt.int64,
            dt.float32, dt.float64]
    cat2 = [dt.str32, dt.str64]
    cat3 = ['date32']
    cat4 = [dt.obj64]
    all_stypes = list(sources.keys())
    for st1 in all_stypes:
        for st2 in all_stypes:
            compatible = \
                (st2 in cat1 or st2 in cat4) if st1 in cat1 else \
                (st2 in cat2 or st2 in cat4) if st1 in cat2 else \
                (st2 in cat3 or st2 in cat4) if st1 in cat3 else \
                True
            f1 = dt.Frame(sources[st1], stype=st1)
            f2 = dt.Frame(sources[st2], stype=st2)
            if compatible:
                f3 = dt.rbind(f1, f2)
                f1.rbind(f2)
                frame_integrity_check(f1)
                frame_integrity_check(f2)
                frame_integrity_check(f3)
                assert f1.nrows == len(sources[st1]) + len(sources[st2])
                assert f3.shape == f1.shape
                assert f1.to_list() == f3.to_list()
                del f1
                del f2
                del f3
            else:
                with pytest.raises(TypeError):
                    dt.rbind(f1, f2)
def test_rbind_strings_large():
    s = "ABCDEFGHIJ" * 110 + "xyz"
    n = 1000000
    assert len(s) * n * 2 > (1 << 31)
    DT0 = dt.Frame(A=[s] * n)
    DT1 = dt.rbind(DT0, DT0)
    assert DT1[-1, 0] == s
def query(self, query, **params):
    """
    Executes the given SQL query against the connected database.
    """
    chunksize = params.pop("chunksize", 100000)
    to_pandas = params.pop("to_pandas", True)
    with self._cursor() as cursor:
        # keep only the keyword arguments that cursor.execute() accepts
        params = {
            k: v for k, v in params.items()
            if k in getargs(cursor.execute).args
        }
        cursor.execute(query, **params)
        fields = [i[0] for i in cursor.description]
        res = []
        # fetch results in chunks and convert each chunk into a Frame
        while True:
            result = cursor.fetchmany(chunksize)
            if not result:
                break
            res.append(Frame(result))
        frame = rbind(res, bynames=False)
        if frame.shape == (0, 0):
            frame = Frame({n: [] for n in fields})
        else:
            frame.names = fields
        if to_pandas:
            frame = frame.to_pandas()
    return frame
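# A minimal, self-contained sketch of the same chunked-fetch + rbind pattern
# shown in query() above, using an in-memory sqlite3 database purely for
# illustration; the table, data, and chunk size below are assumptions, not
# part of the original code.
import sqlite3
import datatable as dt

con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE t (a INTEGER, b TEXT)")
con.executemany("INSERT INTO t VALUES (?, ?)",
                [(i, "row%d" % i) for i in range(10)])

cur = con.execute("SELECT a, b FROM t")
fields = [d[0] for d in cur.description]
chunks = []
while True:
    rows = cur.fetchmany(4)            # fetch a small chunk of rows at a time
    if not rows:
        break
    chunks.append(dt.Frame(rows))      # each chunk of row-tuples becomes a Frame
frame = dt.rbind(chunks, bynames=False)    # stack chunks by column position
frame.names = fields
print(frame.shape)                     # (10, 2)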
def test_rbind_str32_str64():
    DT1 = dt.Frame(A=list('abcd'), stype=dt.str32)
    DT2 = dt.Frame(A=list('efghij'), stype=dt.str64)
    DT3 = dt.Frame(A=list('klm'), stype=dt.str32)
    DTR = dt.rbind(DT1, DT2, DT3)
    # It would be better if the result was str32
    assert_equals(DTR, dt.Frame(A=list('abcdefghijklm'), stype=dt.str64))
def test_rbind_all_stypes():
    sources = {
        dt.bool8: [True, False, True, None, False, None],
        dt.int8: [3, -5, None, 17, None, 99, -99],
        dt.int16: [None, 245, 872, -333, None],
        dt.int32: [10000, None, 1, 0, None, 34, -2222222],
        dt.int64: [None, 9348571093841, -394, 3053867, 111334],
        dt.float32: [None, 3.3, math.inf, -7.123e20, 34098.79],
        dt.float64: [math.inf, math.nan, 341.0, -34985.94872, 1e310],
        dt.str32: ["first", None, "third", "asblkhblierb", ""],
        dt.str64: ["red", "orange", "blue", "purple", "magenta", None],
        dt.obj64: [1, False, "yey", math.nan, (3, "foo"), None, 2.33],
    }
    all_stypes = list(sources.keys())
    for st1 in all_stypes:
        for st2 in all_stypes:
            f1 = dt.Frame(sources[st1], stype=st1)
            f2 = dt.Frame(sources[st2], stype=st2)
            f3 = dt.rbind(f1, f2)
            f1.rbind(f2)
            frame_integrity_check(f1)
            frame_integrity_check(f2)
            frame_integrity_check(f3)
            assert f1.nrows == len(sources[st1]) + len(sources[st2])
            assert f3.shape == f1.shape
            assert f1.to_list() == f3.to_list()
            del f1
            del f2
            del f3
def test_debug_logger_object():
    assert dt.options.debug.logger is None
    logger = SimpleLogger()
    with dt.options.debug.context(logger=logger, enabled=True,
                                  report_args=True):
        assert dt.options.debug.logger is logger
        assert dt.options.debug.enabled is True
        DT = dt.rbind([])
        assert "datatable.rbind([]) {" in logger.msg
        assert re.search(r"} # \d+(?:\.\d+)?(?:[eE][+-]?\d+)? s", logger.msg)
        logger.msg = ""
        with pytest.raises(TypeError):
            dt.rbind(4)
        assert "datatable.rbind(4) {" in logger.msg
        assert re.search(r"} # \d+(?:\.\d+)?(?:[eE][+-]?\d+)? s \(failed\)",
                         logger.msg)
def test_fread_from_glob(tempfile):
    base, ext = os.path.splitext(tempfile)
    if not ext:
        ext = ".csv"
    pattern = base + "*" + ext
    tempfiles = ["".join([base, str(i), ext]) for i in range(10)]
    try:
        for j in range(10):
            with open(tempfiles[j], "w") as f:
                f.write("A,B,C\n0,0,0\n%d,%d,%d\n"
                        % (j, j * 2 + 1, (j + 3) * 17 % 23))
        res = dt.fread(pattern)
        assert len(res) == 10
        assert set(res.keys()) == set(tempfiles)
        for f in res.values():
            assert isinstance(f, dt.Frame)
            frame_integrity_check(f)
            assert f.names == ("A", "B", "C")
            assert f.shape == (2, 3)
        df = dt.rbind(*[res[f] for f in tempfiles])
        frame_integrity_check(df)
        assert df.names == ("A", "B", "C")
        assert df.shape == (20, 3)
        assert df.to_list() == [
            [0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0, 9],
            [0, 1, 0, 3, 0, 5, 0, 7, 0, 9, 0, 11, 0, 13, 0, 15, 0, 17, 0, 19],
            [0, 5, 0, 22, 0, 16, 0, 10, 0, 4, 0, 21, 0, 15, 0, 9, 0, 3, 0, 20]
        ]
    finally:
        for f in tempfiles:
            os.remove(f)
def create_data():
    folder_path = '/home/ubuntu/data/Kaggle/IEEEFraud'  # Modify as needed
    train_identity_file = os.path.join(folder_path, 'train_identity.csv')
    test_identity_file = os.path.join(folder_path, 'test_identity.csv')
    train_transaction_file = os.path.join(folder_path, 'train_transaction.csv')
    test_transaction_file = os.path.join(folder_path, 'test_transaction.csv')
    if not (os.path.isfile(train_identity_file)
            and os.path.isfile(test_identity_file)
            and os.path.isfile(train_transaction_file)
            and os.path.isfile(test_transaction_file)):
        return []

    train_identity = dt.fread(train_identity_file)
    test_identity = dt.fread(test_identity_file)
    train_transaction = dt.fread(train_transaction_file)
    test_transaction = dt.fread(test_transaction_file)

    target = 'isFraud'
    train_identity.key = 'TransactionID'
    test_identity.key = 'TransactionID'

    # Join identity into transactions
    train = train_transaction[:, :, dt.join(train_identity)]
    test = test_transaction[:, :, dt.join(test_identity)]

    # Combine train and test for further processing
    X = dt.rbind([train, test], force=True)

    # Turn integer time column into datetime string with proper format
    startdate = datetime.datetime.strptime('2017-11-30', "%Y-%m-%d")
    pd_time = X[:, 'TransactionDT'].to_pandas()['TransactionDT'].apply(
        lambda x: (startdate + datetime.timedelta(seconds=x)))
    X[:, 'TransactionDT_str'] = dt.Frame(
        pd_time.apply(
            lambda x: datetime.datetime.strftime(x, "%Y-%m-%d %H:%M:%S")))

    # Month - to be used as fold column (that way we get cross-validation
    # without shuffling future/past too much, and minimize overlap between folds)
    fold_column = 'fold_column'
    X[:, fold_column] = dt.Frame(pd_time.dt.month +
                                 (pd_time.dt.year - 2017) * 12)

    # Create start times (in secs) for Dx features (which grow linearly over time)
    for i in range(1, 16):
        X[:, 'Trans_D%d_start' % i] = dt.Frame(
            np.floor(X[:, 'TransactionDT'].to_numpy().ravel() / (24 * 60 * 60))
            - X[:, 'D%d' % i].to_numpy().ravel())

    # Re-order names
    first_names = [target, fold_column]
    names = first_names + [x for x in X.names if x not in first_names]
    X = X[:, names]

    # Split back into train and test
    train = X[:train_transaction.nrows, :]
    test = X[train_transaction.nrows:, :]
    return {'IEEE.train': train, 'IEEE.test': test}
def test_rbind_different_types_force():
    DT1 = dt.Frame(A=[1, 4, 77])
    DT2 = dt.Frame(A=["Hi", "there", None])
    DT3 = dt.Frame(A=['2010-11-01', '2020-08-14', '2022-12-12'], type='date32')
    with pytest.raises(TypeError):
        dt.rbind(DT1, DT2)
    with pytest.raises(TypeError):
        dt.rbind(DT1, DT3)
    with pytest.raises(TypeError):
        dt.rbind(DT3, DT2)
    assert_equals(dt.rbind(DT1, DT2, force=True),
                  dt.Frame(A=["1", "4", "77", "Hi", "there", None]))
    assert_equals(
        dt.rbind(DT1, DT3, force=True),
        dt.Frame(A=["1", "4", "77", "2010-11-01", "2020-08-14", "2022-12-12"]))
    assert_equals(
        dt.rbind(DT2, DT3, force=True),
        dt.Frame(
            A=["Hi", "there", None, "2010-11-01", "2020-08-14", "2022-12-12"]))
def pd_dt_concat(frames, axis=0):
    """
    Concatenate a sequence of datatable Frames or pandas DataFrames `frames`
    along `axis` (0 means rows, 1 means columns).
    """
    if USE_DT:
        if axis == 0:
            return dt.rbind(*frames)
        elif axis == 1:
            return dt.cbind(*frames)
        else:
            raise ValueError('invalid axis:', axis)
    else:
        return pd.concat(frames, axis=axis)
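# A minimal usage sketch for pd_dt_concat() above. The USE_DT flag is the
# module-level switch assumed by the function; the sample frames below are
# made up for illustration only.
import datatable as dt

USE_DT = True

part1 = dt.Frame(A=[1, 2], B=["x", "y"])
part2 = dt.Frame(A=[3], B=["z"])
part3 = dt.Frame(C=[10.5, 20.5])

stacked = pd_dt_concat([part1, part2], axis=0)        # row-wise: dt.rbind
side_by_side = pd_dt_concat([part1, part3], axis=1)   # column-wise: dt.cbind
print(stacked.shape, side_by_side.shape)              # (3, 2) (2, 3)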
def test_groupby_large_random_integers(seed):
    random.seed(seed)
    ngrps1 = random.choice([1, 1, 2, 2, 2, 3, 4, 5])
    n0 = 1 << random.choice([1, 1, 2, 2, 2, 3, 3, 3, 4, 5, 6, 7])
    chunks = ([random.sample(range(n0), random.randint(1, n0))] +
              [random.sample([0] * 100 + list(range(256)),
                             random.randint(1, 20))
               for i in range(ngrps1)])
    n = int(random.expovariate(0.0001)) + 10
    sample = [sum(random.choice(chunks[i]) << (8 * i)
                  for i in range(len(chunks)))
              for _ in range(n)]
    nuniques = len(set(sample))
    f0 = dt.Frame(sample)
    assert f0.nunique1() == nuniques
    f1 = dt.rbind(*([f0] * random.randint(2, 20)))
    assert f1.nunique1() == nuniques
def create_data(X: dt.Frame = None):
    # to be used with models.algorithms.logistic_regression.py with
    # _kaggle = True
    # _kaggle_features = True
    # _kaggle_mode = True
    train = dt.fread("/home/jon/kaggle/cat/inputs/train.csv.zip")
    train_orig = dt.Frame(train)
    train['sample_weight'] = dt.Frame(np.array([1.0] * train.shape[0]))
    test = dt.fread("/home/jon/kaggle/cat/inputs/test.csv.zip")
    test_orig = dt.Frame(test)
    test['sample_weight'] = dt.Frame(np.array([0.0] * test.shape[0]))
    test['target'] = dt.Frame(np.array([0.0] * test.shape[0]))
    final = dt.rbind([train, test])
    return {
        'catmerged': final,
        'cattrain': train_orig,
        'cattest': test_orig
    }
def create_data(X: dt.Frame = None):
    # to be used with models.algorithms.logistic_regression.py with
    # _kaggle = True
    # _kaggle_features = True
    # _kaggle_mode = True
    path = "/home/jon/kaggle/cat/inputs/"
    if not os.path.exists(path):
        return []
    train = dt.fread(os.path.join(path, "train.csv.zip"))
    train_orig = dt.Frame(train)
    train['sample_weight'] = dt.Frame(np.array([1.0] * train.shape[0]))
    test = dt.fread(os.path.join(path, "test.csv.zip"))
    test_orig = dt.Frame(test)
    test['sample_weight'] = dt.Frame(np.array([0.0] * test.shape[0]))
    test['target'] = dt.Frame(np.array([0] * test.shape[0], dtype=int))
    final = dt.rbind([train, test])
    return {
        'catmerged': final,
        'cattrain': train_orig,
        'cattest': test_orig
    }
def test_fread_from_glob(tempfile):
    base, ext = os.path.splitext(tempfile)
    if not ext:
        ext = ".csv"
    pattern = base + "*" + ext
    tempfiles = ["".join([base, str(i), ext]) for i in range(10)]
    try:
        for j in range(10):
            with open(tempfiles[j], "w") as f:
                f.write("A,B,C\n0,0,0\n%d,%d,%d\n"
                        % (j, j * 2 + 1, (j + 3) * 17 % 23))
        res = dt.iread(pattern)
        assert res.__class__.__name__ == "read_iterator"
        res = list(res)
        assert len(res) == 10
        assert set(DTj.source for DTj in res) == set(tempfiles)
        # The glob pattern tempfile*.csv may have returned the files in a
        # shuffled order, need to sort them back from 0 to 9:
        res = sorted(res, key=lambda DTj: DTj.source)
        for j in range(10):
            DTj = res[j]
            assert isinstance(DTj, dt.Frame)
            frame_integrity_check(DTj)
            assert DTj.names == ("A", "B", "C")
            assert DTj.shape == (2, 3)
        df = dt.rbind(res)
        frame_integrity_check(df)
        assert df.names == ("A", "B", "C")
        assert df.shape == (20, 3)
        assert df.to_list() == [
            [0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0, 9],
            [0, 1, 0, 3, 0, 5, 0, 7, 0, 9, 0, 11, 0, 13, 0, 15, 0, 17, 0, 19],
            [0, 5, 0, 22, 0, 16, 0, 10, 0, 4, 0, 21, 0, 15, 0, 9, 0, 3, 0, 20]
        ]
    finally:
        for f in tempfiles:
            os.remove(f)
def load_table(name, data_dir):
    """
    Load all PSet tables with name into a datatable, dropping any duplicate rows.

    @param name: [`string`] The name of the table
    @param data_dir: [`string`] File path to the directory with all PSet tables

    @return: [`datatable.Frame`] A datatable containing all rows from all PSets
    """
    # Get all files
    files = glob.glob(os.path.join(data_dir, '**', f'*{name}.csv'))
    # Filter so that file paths are '{data_dir}/{pset}/{pset}_{name}.csv'
    files = [
        file_name for file_name in files
        if re.search(data_dir + r'/(\w+)/\1_' + name + '.csv$', file_name)
    ]
    # Read and concatenate tables
    df = rbind(*iread(files, sep=','))
    # Replace any empty strings with None/NA
    df.replace("", None)
    # Drop duplicates
    # (groups by all columns and selects only the first row from each group)
    df = df[0, :, by(df.names)]
    return df
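# A minimal, self-contained illustration of the duplicate-dropping idiom used
# at the end of load_table() (df[0, :, by(df.names)]): group by every column
# and keep only the first row of each group. The toy frame below is made up
# for demonstration only.
import datatable as dt
from datatable import by

DT = dt.Frame(A=[1, 1, 2, 2, 2], B=["x", "x", "y", "y", "z"])
dedup = DT[0, :, by(DT.names)]      # one row per unique (A, B) combination
print(dedup.to_list())              # [[1, 2, 2], ['x', 'y', 'z']]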
def create_data(
        X: dt.Frame = None
) -> Union[str, List[str], dt.Frame, List[dt.Frame],
           np.ndarray, List[np.ndarray],
           pd.DataFrame, List[pd.DataFrame]]:
    import os
    from h2oaicore.systemutils_more import download
    from h2oaicore.systemutils import config
    import bz2

    def extract_bz2(file, output_file):
        zipfile = bz2.BZ2File(file)
        data = zipfile.read()
        open(output_file, 'wb').write(data)

    temp_path = os.path.join(user_dir(), "recipe_tmp", "airlines")
    os.makedirs(temp_path, exist_ok=True)
    dt.options.nthreads = 8

    # specify which years are used for training and testing
    training = list(range(2005, 2008))
    testing = [2008]

    # download and unzip files
    files = []
    for f in ["%d.csv.bz2" % year for year in training + testing]:
        link = AirlinesData.base_url + "%s" % f
        file = download(link, dest_path=temp_path)
        output_file = file.replace(".bz2", "")
        if not os.path.exists(output_file):
            extract_bz2(file, output_file)
        files.append(output_file)

    # parse with datatable
    X = dt.rbind(*[dt.fread(x) for x in files])

    # add date
    date_col = 'Date'
    X[:, date_col] = dt.f['Year'] * 10000 + dt.f['Month'] * 100 + dt.f['DayofMonth']
    cols_to_keep = ['Date']

    # add number of flights in/out for each airport per given interval
    timeslice_mins = 60
    for name, new_col, col, group in [
            ("out", "CRSDepTime_mod", "CRSDepTime", "Origin"),
            ("in", "CRSArrTime_mod", "CRSArrTime", "Dest")
    ]:
        X[:, new_col] = X[:, dt.f[col] // timeslice_mins]
        group_cols = [date_col, group, new_col]
        new_name = 'flights_%s_per_%d_min' % (name, timeslice_mins)
        flights = X[:, {new_name: dt.count()}, dt.by(*group_cols)]
        flights.key = group_cols
        cols_to_keep.append(new_name)
        X = X[:, :, dt.join(flights)]

    # select flights leaving from SFO only
    X = X[dt.f['Origin'] == 'SFO', :]

    # Fill NaNs in DepDelay column
    X[dt.isna(dt.f['DepDelay']), 'DepDelay'] = 0

    # create binary target column
    depdelay_threshold_mins = 15
    target = 'DepDelay%dm' % depdelay_threshold_mins
    X[:, target] = dt.f['DepDelay'] > depdelay_threshold_mins
    cols_to_keep.extend([
        target,
        'Year', 'Month', 'DayofMonth', 'DayOfWeek', 'CRSDepTime',
        'UniqueCarrier', 'FlightNum', 'TailNum', 'CRSElapsedTime',
        'Origin', 'Dest', 'Distance',
        # Leaks for delay
        # 'DepTime',
        # 'ArrTime', 'CRSArrTime',
        # 'ActualElapsedTime',
        # 'AirTime', 'ArrDelay', 'DepDelay',
        # 'TaxiIn', 'TaxiOut', 'Cancelled', 'CancellationCode', 'Diverted',
        # 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay',
        # 'LateAircraftDelay',
    ])
    X = X[:, cols_to_keep]

    # Join in some extra info
    join_files = [('UniqueCarrier', 'carriers.csv', 'Code'),
                  ('Origin', 'airports.csv', 'iata'),
                  ('Dest', 'airports.csv', 'iata'),
                  ('TailNum', 'plane-data.csv', 'tailnum')]
    for join_key, file, col in join_files:
        file = download(
            'https://0xdata-public.s3.amazonaws.com/data_recipes_data/%s' % file,
            dest_path=temp_path)
        X_join = dt.fread(file, fill=True)
        X_join.names = {col: join_key}
        X_join.names = [join_key] + [
            join_key + "_" + x for x in X_join.names if x != join_key
        ]
        X_join.key = join_key
        X = X[:, :, dt.join(X_join)]
        del X[:, join_key]

    split = True
    if not split:
        filename = os.path.join(
            temp_path,
            "flight_delays_data_recipe_%d-%d.csv" % (min(training), max(testing)))
        X.to_csv(filename)
        return filename
    else:
        # prepare splits (by year) and create binary .jay files for import
        # into Driverless AI
        output_files = []
        for condition, name in [
                ((min(training) <= dt.f['Year']) & (dt.f['Year'] <= max(training)),
                 'training'),
                ((min(testing) <= dt.f['Year']) & (dt.f['Year'] <= max(testing)),
                 'test'),
        ]:
            X_split = X[condition, :]
            filename = os.path.join(
                temp_path,
                "augmented_flights_%s-%d_%s.csv"
                % (X_split[:, 'Year'].min1(), X_split[:, 'Year'].max1(), name))
            X_split.to_csv(filename)
            output_files.append(filename)
        return output_files
def test_rbind_modulefn():
    f0 = dt.Frame([1, 5409, 204])
    f1 = dt.Frame([109813, None, 9385])
    f3 = dt.rbind(f0, f1)
    f3.internal.check()
    assert f3.topython()[0] == f0.topython()[0] + f1.topython()[0]
def test_rbind_modulefn():
    f0 = dt.Frame([1, 5409, 204])
    f1 = dt.Frame([109813, None, 9385])
    f3 = dt.rbind(f0, f1)
    frame_integrity_check(f3)
    assert f3.to_list()[0] == f0.to_list()[0] + f1.to_list()[0]
}
# pset_tables: ["dose_response", "drug", "datasets_cells",
#   "dataset_statistics", "cell", "drug_annotation", "gene_drug",
#   "profile", "dataset", "mol_cell", "gene_annotation", "dataset_cell",
#   "experiment", "tissue", "gene"]

pset_name = psets[3]  # GDSC_v1

# -- Read in a single .csv
experiment = fread(
    os.path.join(data_dir, pset_name, pset_tables[pset_name][-3],
                 f'*{pset_tables[pset_name][-3]}*.csv'))

# -- Read in multiple .csv files and make a single Frame
dose_response = rbind(*iread(
    os.path.join(data_dir, pset_name, pset_tables[pset_name][0],
                 f'*{pset_tables[pset_name][0]}*.csv')))

# Can use pattern matching to read in multiple files; ** will match any number
# of subdirectories. Should make path parsing code much more compact.
all_cell_tables = rbind(
    *iread(os.path.join(data_dir, '**', 'cell', '*cell.csv')))

# -- Write to csv
dose_response.to_csv(
    os.path.join(output_dir, f'{pset_tables[pset_name][0]}.csv'))

# -- Select (of the form df[filter, select, ...])
# f is for Frame and references variables within the Frame object (i.e., columns)
dose_response[:, [f.id, f.experiment_id]]
def test_rbind_void():
    DT1 = dt.Frame([None] * 10)
    DT2 = dt.Frame([None] * 3)
    res = dt.rbind(DT1, DT2)
    assert res.types == [dt.Type.void]
def test_not_inplace():
    dt0 = dt.Frame({"A": [5, 1], "B": [4, 4]})
    dt1 = dt.Frame({"A": [22], "B": [11]})
    dtr = dt.rbind(dt0, dt1)
    assert_equals(dtr, dt.Frame({"A": [5, 1, 22], "B": [4, 4, 11]}))
    assert_equals(dt0, dt.Frame({"A": [5, 1], "B": [4, 4]}))
def test_issue2621_a():
    # Rbinding an iterator of frames should produce correct result
    RES = dt.rbind(dt.Frame(A=[i], B=['hey']) for i in range(10))
    assert_equals(RES, dt.Frame(A=range(10), B=['hey'] * 10))
def test_issue2621_b():
    src = """c1, c2, c3
             11, 2, 3"""
    RES = dt.rbind(dt.iread([src, src]))
    assert_equals(RES, dt.Frame(c1=[11, 11], c2=[2, 2], c3=[3, 3]))
    # counts by target groups
    g = X[:, {"count": count()}, by(target_col)]
    if not g.shape[0] == 2:
        raise ValueError(
            "Not a binary target - target column must contain exactly 2 values.")

    # find sizes and target values for minority and majority class partitions
    n_minority = g[:, min(f.count)][0, 0]
    n_majority = g[:, max(f.count)][0, 0]
    target_minority = g[f.count == n_minority, target_col][0, 0]
    target_majority = g[f.count == n_majority, target_col][0, 0]

    # validate that times indeed downsamples majority class
    if times * n_minority >= n_majority:
        raise ValueError(
            "Downsampling coefficient `times` is too large: "
            "downsampled dataset results in inflated majority class.")

    # downsample with pandas frame
    df_majority = X[f[target_col] == target_majority, :].to_pandas()
    df_majority_downsampled = resample(df_majority,
                                       replace=False,
                                       n_samples=n_minority * times,
                                       random_state=random_seed)
    return {
        new_dataset_name: rbind(X[f[target_col] == target_minority, :],
                                dt.Frame(df_majority_downsampled))
    }
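# A small, self-contained sketch of the same downsample-then-rbind pattern on a
# toy frame; the column name "y", the class counts, and times=2 are made up for
# illustration only.
import datatable as dt
from datatable import f
from sklearn.utils import resample

X = dt.Frame(y=[0] * 8 + [1] * 2, x=list(range(10)))
times = 2                                  # keep majority at times x minority size

minority = X[f.y == 1, :]
majority_pd = X[f.y == 0, :].to_pandas()
majority_down = resample(majority_pd, replace=False,
                         n_samples=times * minority.nrows,
                         random_state=42)
balanced = dt.rbind(minority, dt.Frame(majority_down))
print(balanced.shape)                      # (6, 2): 2 minority + 4 majority rows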
def test_bool8_small_view():
    DT0 = dt.Frame([True, False, False, True, None, True, True, None])
    DT1 = dt.Frame([None, None, False, False, True, True, True, True])
    DTS = dt.rbind(DT0[::2, :], DT0[1::2, :]).sort(0)
    assert_equals(DTS, DT1)
})[:, {
    'frequency': count(),
    'url': None,
    'status': 'not.found',
    'action': None
}, by(f.hpo)]

# set url for HPO code if it exists
hpoCodes['url'] = dt.Frame([
    f'http://purl.obolibrary.org/obo/{d}'
    for d in hpoCodes['hpo'].to_list()[0]
])

# manually check each link and search for code. Either add the code or follow
# up with SolveRD project data coordinators.
rbind(fread('data/unknown_hpo_codes.csv'),
      hpoCodes).to_csv('data/unknown_hpo_codes.csv')

# ~ 3c ~
# Investigate 'unavailable' subjects (i.e., subjects that do not exist in
# the current freeze). If subjects were added to 'unavailable', pull
# subjectIDs from another freeze(s) to see if these subjects exist in another
# freeze
otherFreezeIDs = rd3tools.flatten_attr(
    rd3.get('rd3_freeze1_subject', attributes='id', batch_size=10000),
    'id')
novelOmicsIDs = rd3tools.flatten_attr(
    rd3.get('rd3_novelomics_subject', attributes='id', batch_size=10000),
    'id')
unknownSubjects = dt.Frame(unavailable, types={
    'id': str,
    'dateofBirth': str,