Example No. 1
def test_rbind_infinite():
    def foo():
        yield dt.Frame()
        yield from foo()

    with pytest.raises(RecursionError):
        dt.rbind(foo())
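For contrast, a bounded sketch: wrapping the same generator in `itertools.islice` (an addition, not part of the original test) lets `rbind` consume only a finite number of frames:

import itertools

def foo():
    yield dt.Frame()
    yield from foo()

DT = dt.rbind(itertools.islice(foo(), 5))  # consume only 5 empty frames
assert DT.shape == (0, 0)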
Example No. 2
def test_rbind_save_to_jay(tempfile_jay):
    DT0 = dt.Frame(A=range(5), B=[None]*5)
    DT1 = dt.Frame(A=[7, 11, 95], B=["one", "TWO", "thr33"])
    dt.rbind(DT0, DT1).to_jay(tempfile_jay)
    RES = dt.fread(tempfile_jay)
    assert_equals(RES, dt.Frame(A=[0, 1, 2, 3, 4, 7, 11, 95],
                                B=[None]*5 + ["one", "TWO", "thr33"]))
Example No. 3
def test_rbind_view4():
    DT = dt.Frame(A=list('abcdefghijklmnop'))
    DTempty = dt.Frame(A=[])
    DTempty.nrows = 2
    assert_equals(dt.rbind(DT[:3, :], DT[:-4:-1, :]),
                  dt.Frame(A=list('abcpon')))
    assert_equals(dt.rbind(DT[2:4, :], DTempty, DT[2:5, :]),
                  dt.Frame(A=['c', 'd', None, None, 'c', 'd', 'e']))
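As a reference for the slices used above, a minimal sketch of what each row selector picks (the asserts are illustrative):

DT = dt.Frame(A=list('abcdefghijklmnop'))
assert DT[:3, 'A'].to_list() == [['a', 'b', 'c']]      # first three rows
assert DT[:-4:-1, 'A'].to_list() == [['p', 'o', 'n']]  # last three, reversed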
Example No. 4
def test_rbind_infinite2():
    class A:
        def __next__(self):
            return self

        def __iter__(self):
            return self

    msg = r"Frame.rbind\(\) expects a list or sequence of Frames as an " \
          r"argument; instead item 0 was a <class '.*\.A'>"
    with pytest.raises(TypeError, match=msg):
        dt.rbind(A())
Example No. 5
def test_rbind_all_stypes():
    from datetime import date as d
    sources = {
        dt.bool8: [True, False, True, None, False, None],
        dt.int8: [3, -5, None, 17, None, 99, -99],
        dt.int16: [None, 245, 872, -333, None],
        dt.int32: [10000, None, 1, 0, None, 34, -2222222],
        dt.int64: [None, 9348571093841, -394, 3053867, 111334],
        dt.float32: [None, 3.3, math.inf, -7.123e20, 34098.79],
        dt.float64: [math.inf, math.nan, 341.0, -34985.94872, 1e310],
        dt.str32: ["first", None, "third", "asblkhblierb", ""],
        dt.str64: ["red", "orange", "blue", "purple", "magenta", None],
        dt.obj64: [1, False, "yey", math.nan, (3, "foo"), None, 2.33],
        'date32': [d(2003, 11, 25), None,
                   d(2017, 5, 16),
                   d(2005, 6, 5)]
    }
    cat1 = [
        dt.bool8, dt.int8, dt.int16, dt.int32, dt.int64, dt.float32, dt.float64
    ]
    cat2 = [dt.str32, dt.str64]
    cat3 = ['date32']
    cat4 = [dt.obj64]
    all_stypes = list(sources.keys())
    for st1 in all_stypes:
        for st2 in all_stypes:
            compatible = \
                (st2 in cat1 or st2 in cat4) if st1 in cat1 else \
                (st2 in cat2 or st2 in cat4) if st1 in cat2 else \
                (st2 in cat3 or st2 in cat4) if st1 in cat3 else \
                True
            f1 = dt.Frame(sources[st1], stype=st1)
            f2 = dt.Frame(sources[st2], stype=st2)
            if compatible:
                f3 = dt.rbind(f1, f2)
                f1.rbind(f2)
                frame_integrity_check(f1)
                frame_integrity_check(f2)
                frame_integrity_check(f3)
                assert f1.nrows == len(sources[st1]) + len(sources[st2])
                assert f3.shape == f1.shape
                assert f1.to_list() == f3.to_list()
                del f1
                del f2
                del f3
            else:
                with pytest.raises(TypeError):
                    dt.rbind(f1, f2)
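A concrete instance of the compatibility rule encoded above, as a sketch: numeric stypes rbind freely with one another, while a numeric/string mix raises unless `force=True` is passed:

f_num = dt.rbind(dt.Frame([1, 2], stype=dt.int8),
                 dt.Frame([0.5], stype=dt.float64))   # numeric + numeric: ok
assert f_num.stypes == (dt.float64,)
with pytest.raises(TypeError):
    dt.rbind(dt.Frame([1, 2]), dt.Frame(["a"]))       # numeric + string: error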
Example No. 6
def test_rbind_strings_large():
    s = "ABCDEFGHIJ" * 110 + "xyz"
    n = 1000000
    assert len(s) * n * 2 > (1 << 31)
    DT0 = dt.Frame(A=[s] * n)
    DT1 = dt.rbind(DT0, DT0)
    assert DT1[-1, 0] == s
Example No. 7
    def query(self, query, **params):
        """
        Executes the given SQL query against the connected database.
        """
        chunksize = params.pop("chunksize", 100000)
        to_pandas = params.pop("to_pandas", True)
        with self._cursor() as cursor:
            # Forward only the keyword arguments that cursor.execute() accepts
            # (`getargs` is assumed to be imported elsewhere in this module).
            params = {
                k: v
                for k, v in params.items() if k in getargs(cursor.execute).args
            }
            cursor.execute(query, **params)
            fields = [i[0] for i in cursor.description]
            res = []
            # Fetch in chunks, wrapping each chunk in a datatable Frame.
            while True:
                result = cursor.fetchmany(chunksize)
                if not result:
                    break
                res.append(Frame(result))
        frame = rbind(res, bynames=False)
        if frame.shape == (0, 0):
            # No rows returned: build an empty frame with the right columns.
            frame = Frame({n: [] for n in fields})
        else:
            frame.names = fields
        if to_pandas:
            frame = frame.to_pandas()
        return frame
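A hypothetical usage sketch, assuming `db` is an instance of the class defining `query()` above and is connected to a SQL database (the query text is illustrative):

frame = db.query("SELECT id, name FROM users",
                 chunksize=50000,  # rows fetched per fetchmany() call
                 to_pandas=False)  # keep the result as a datatable Frame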
Example No. 8
def test_rbind_str32_str64():
    DT1 = dt.Frame(A=list('abcd'), stype=dt.str32)
    DT2 = dt.Frame(A=list('efghij'), stype=dt.str64)
    DT3 = dt.Frame(A=list('klm'), stype=dt.str32)
    DTR = dt.rbind(DT1, DT2, DT3)
    # It would be better if the result was str32
    assert_equals(DTR, dt.Frame(A=list('abcdefghijklm'), stype=dt.str64))
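If str32 is required, the result can be cast back explicitly; a sketch, assuming the data fits within str32 limits (stypes are callable as cast functions in datatable):

DTR = dt.rbind(DT1, DT2, DT3)
DTR = DTR[:, dt.str32(dt.f.A)]   # explicit downcast of the result
assert DTR.stypes == (dt.str32,)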
Example No. 9
def test_rbind_all_stypes():
    sources = {
        dt.bool8: [True, False, True, None, False, None],
        dt.int8: [3, -5, None, 17, None, 99, -99],
        dt.int16: [None, 245, 872, -333, None],
        dt.int32: [10000, None, 1, 0, None, 34, -2222222],
        dt.int64: [None, 9348571093841, -394, 3053867, 111334],
        dt.float32: [None, 3.3, math.inf, -7.123e20, 34098.79],
        dt.float64: [math.inf, math.nan, 341.0, -34985.94872, 1e310],
        dt.str32: ["first", None, "third", "asblkhblierb", ""],
        dt.str64: ["red", "orange", "blue", "purple", "magenta", None],
        dt.obj64: [1, False, "yey", math.nan, (3, "foo"), None, 2.33],
    }
    all_stypes = list(sources.keys())
    for st1 in all_stypes:
        for st2 in all_stypes:
            f1 = dt.Frame(sources[st1], stype=st1)
            f2 = dt.Frame(sources[st2], stype=st2)
            f3 = dt.rbind(f1, f2)
            f1.rbind(f2)
            frame_integrity_check(f1)
            frame_integrity_check(f2)
            frame_integrity_check(f3)
            assert f1.nrows == len(sources[st1]) + len(sources[st2])
            assert f3.shape == f1.shape
            assert f1.to_list() == f3.to_list()
            del f1
            del f2
            del f3
Example No. 10
def test_debug_logger_object():
    assert dt.options.debug.logger is None
    logger = SimpleLogger()
    with dt.options.debug.context(logger=logger, enabled=True, report_args=True):
        assert dt.options.debug.logger is logger
        assert dt.options.debug.enabled is True

        DT = dt.rbind([])
        assert "datatable.rbind([]) {" in logger.msg
        assert re.search(r"} # \d+(?:\.\d+)?(?:[eE][+-]?\d+)? s", logger.msg)
        logger.msg = ""

        with pytest.raises(TypeError):
            dt.rbind(4)
        assert "datatable.rbind(4) {" in logger.msg
        assert re.search(r"} # \d+(?:\.\d+)?(?:[eE][+-]?\d+)? s \(failed\)", logger.msg)
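The `SimpleLogger` helper is not shown in this example; a minimal sketch compatible with the assertions above, assuming datatable calls the logger's `debug()` method with each message:

class SimpleLogger:
    def __init__(self):
        self.msg = ""

    def debug(self, message):
        self.msg += message + "\n"   # accumulate messages for the asserts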
Example No. 11
def test_fread_from_glob(tempfile):
    base, ext = os.path.splitext(tempfile)
    if not ext:
        ext = ".csv"
    pattern = base + "*" + ext
    tempfiles = ["".join([base, str(i), ext]) for i in range(10)]
    try:
        for j in range(10):
            with open(tempfiles[j], "w") as f:
                f.write("A,B,C\n0,0,0\n%d,%d,%d\n"
                        % (j, j * 2 + 1, (j + 3) * 17 % 23))
        res = dt.fread(pattern)
        assert len(res) == 10
        assert set(res.keys()) == set(tempfiles)
        for f in res.values():
            assert isinstance(f, dt.Frame)
            frame_integrity_check(f)
            assert f.names == ("A", "B", "C")
            assert f.shape == (2, 3)
        df = dt.rbind(*[res[f] for f in tempfiles])
        frame_integrity_check(df)
        assert df.names == ("A", "B", "C")
        assert df.shape == (20, 3)
        assert df.to_list() == [
            [0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0, 9],
            [0, 1, 0, 3, 0, 5, 0, 7, 0, 9, 0, 11, 0, 13, 0, 15, 0, 17, 0, 19],
            [0, 5, 0, 22, 0, 16, 0, 10, 0, 4, 0, 21, 0, 15, 0, 9, 0, 3, 0, 20]
        ]
    finally:
        for f in tempfiles:
            os.remove(f)
Example No. 12
    def create_data():
        folder_path = '/home/ubuntu/data/Kaggle/IEEEFraud'  # Modify as needed

        train_identity_file = os.path.join(folder_path, 'train_identity.csv')
        test_identity_file = os.path.join(folder_path, 'test_identity.csv')
        train_transaction_file = os.path.join(folder_path,
                                              'train_transaction.csv')
        test_transaction_file = os.path.join(folder_path,
                                             'test_transaction.csv')
        if not (os.path.isfile(train_identity_file)
                and os.path.isfile(test_identity_file)
                and os.path.isfile(train_transaction_file)
                and os.path.isfile(test_transaction_file)):
            return []

        train_identity = dt.fread(train_identity_file)
        test_identity = dt.fread(test_identity_file)
        train_transaction = dt.fread(train_transaction_file)
        test_transaction = dt.fread(test_transaction_file)

        target = 'isFraud'
        train_identity.key = 'TransactionID'
        test_identity.key = 'TransactionID'

        # Join identity into transactions
        train = train_transaction[:, :, dt.join(train_identity)]
        test = test_transaction[:, :, dt.join(test_identity)]

        # Combine train and test for further processing
        X = dt.rbind([train, test], force=True)

        # Turn integer time column into datetime string with proper format
        startdate = datetime.datetime.strptime('2017-11-30', "%Y-%m-%d")
        pd_time = X[:, 'TransactionDT'].to_pandas()['TransactionDT'].apply(
            lambda x: (startdate + datetime.timedelta(seconds=x)))
        X[:, 'TransactionDT_str'] = dt.Frame(
            pd_time.apply(
                lambda x: datetime.datetime.strftime(x, "%Y-%m-%d %H:%M:%S")))
        # Month - to be used as fold column (that way get cross-validation
        # without shuffling future/past too much, minimize overlap between folds)
        fold_column = 'fold_column'
        X[:, fold_column] = dt.Frame(pd_time.dt.month +
                                     (pd_time.dt.year - 2017) * 12)

        # Create start times (in secs) for Dx features (which are growing linearly over time)
        for i in range(1, 16):
            X[:, 'Trans_D%d_start' % i] = dt.Frame(
                np.floor(X[:, 'TransactionDT'].to_numpy().ravel() /
                         (24 * 60 * 60)) - X[:, 'D%d' % i].to_numpy().ravel())

        # re-order names
        first_names = [target, fold_column]
        names = first_names + [x for x in X.names if x not in first_names]
        X = X[:, names]

        # Split back into train and test
        train = X[:train_transaction.nrows, :]
        test = X[train_transaction.nrows:, :]
        return {'IEEE.train': train, 'IEEE.test': test}
Example No. 13
def test_rbind_different_types_force():
    DT1 = dt.Frame(A=[1, 4, 77])
    DT2 = dt.Frame(A=["Hi", "there", None])
    DT3 = dt.Frame(A=['2010-11-01', '2020-08-14', '2022-12-12'], type='date32')
    with pytest.raises(TypeError):
        dt.rbind(DT1, DT2)
    with pytest.raises(TypeError):
        dt.rbind(DT1, DT3)
    with pytest.raises(TypeError):
        dt.rbind(DT3, DT2)
    assert_equals(dt.rbind(DT1, DT2, force=True),
                  dt.Frame(A=["1", "4", "77", "Hi", "there", None]))
    assert_equals(
        dt.rbind(DT1, DT3, force=True),
        dt.Frame(A=["1", "4", "77", "2010-11-01", "2020-08-14", "2022-12-12"]))
    assert_equals(
        dt.rbind(DT2, DT3, force=True),
        dt.Frame(
            A=["Hi", "there", None, "2010-11-01", "2020-08-14", "2022-12-12"]))
Example No. 14
def pd_dt_concat(frames, axis=0):
    """
    Concatenate a sequence of datatable Frames or pandas DataFrames `frames`
    along `axis` (0 means rows, 1 means columns).
    """

    if USE_DT:
        if axis == 0:
            return dt.rbind(*frames)
        elif axis == 1:
            return dt.cbind(*frames)
        else:
            raise ValueError('invalid axis: %r' % axis)
    else:
        return pd.concat(frames, axis=axis)
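A brief usage sketch, assuming `USE_DT` is true so that both calls dispatch to datatable:

a = dt.Frame(x=[1, 2])
b = dt.Frame(x=[3])
assert pd_dt_concat([a, b], axis=0).shape == (3, 1)                   # rbind
assert pd_dt_concat([a, dt.Frame(y=[7, 8])], axis=1).shape == (2, 2)  # cbind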
Example No. 15
def test_groupby_large_random_integers(seed):
    random.seed(seed)
    ngrps1 = random.choice([1, 1, 2, 2, 2, 3, 4, 5])
    n0 = 1 << random.choice([1, 1, 2, 2, 2, 3, 3, 3, 4, 5, 6, 7])
    chunks = ([random.sample(range(n0), random.randint(1, n0))] +
              [random.sample([0] * 100 + list(range(256)), random.randint(1, 20))
               for i in range(ngrps1)])
    n = int(random.expovariate(0.0001)) + 10
    sample = [sum(random.choice(chunks[i]) << (8*i) for i in range(len(chunks)))
              for _ in range(n)]
    nuniques = len(set(sample))
    f0 = dt.Frame(sample)
    assert f0.nunique1() == nuniques
    f1 = dt.rbind(*([f0] * random.randint(2, 20)))
    assert f1.nunique1() == nuniques
Example No. 16
    def create_data(X: dt.Frame = None):

        # to be used with models.algorithms.logistic_regression.py with
        # _kaggle = True
        # _kaggle_features = True
        # _kaggle_mode = True
        train = dt.fread("/home/jon/kaggle/cat/inputs/train.csv.zip")
        train_orig = dt.Frame(train)
        train['sample_weight'] = dt.Frame(np.array([1.0] * train.shape[0]))
        test = dt.fread("/home/jon/kaggle/cat/inputs/test.csv.zip")
        test_orig = dt.Frame(test)
        test['sample_weight'] = dt.Frame(np.array([0.0] * test.shape[0]))
        test['target'] = dt.Frame(np.array([0.0] * test.shape[0]))
        final = dt.rbind([train, test])

        return {
            'catmerged': final,
            'cattrain': train_orig,
            'cattest': test_orig
        }
Example No. 17
    def create_data(X: dt.Frame = None):
        # to be used with models.algorithms.logistic_regression.py with
        # _kaggle = True
        # _kaggle_features = True
        # _kaggle_mode = True
        path = "/home/jon/kaggle/cat/inputs/"
        if not os.path.exists(path):
            return []

        train = dt.fread(os.path.join(path, "train.csv.zip"))
        train_orig = dt.Frame(train)
        train['sample_weight'] = dt.Frame(np.array([1.0] * train.shape[0]))
        test = dt.fread(os.path.join(path, "test.csv.zip"))
        test_orig = dt.Frame(test)
        test['sample_weight'] = dt.Frame(np.array([0.0] * test.shape[0]))
        test['target'] = dt.Frame(np.array([0] * test.shape[0], dtype=int))
        final = dt.rbind([train, test])

        return {
            'catmerged': final,
            'cattrain': train_orig,
            'cattest': test_orig
        }
Example No. 18
def test_fread_from_glob(tempfile):
    base, ext = os.path.splitext(tempfile)
    if not ext:
        ext = ".csv"
    pattern = base + "*" + ext
    tempfiles = ["".join([base, str(i), ext]) for i in range(10)]
    try:
        for j in range(10):
            with open(tempfiles[j], "w") as f:
                f.write("A,B,C\n0,0,0\n%d,%d,%d\n"
                        % (j, j * 2 + 1, (j + 3) * 17 % 23))
        res = dt.iread(pattern)
        assert res.__class__.__name__ == "read_iterator"
        res = list(res)
        assert len(res) == 10
        assert set(DTj.source for DTj in res) == set(tempfiles)
        # The glob pattern tempfile*.csv may have returned the files in a
        # shuffled order, need to sort them back from 0 to 9:
        res = sorted(res, key=lambda DTj: DTj.source)
        for j in range(10):
            DTj = res[j]
            assert isinstance(DTj, dt.Frame)
            frame_integrity_check(DTj)
            assert DTj.names == ("A", "B", "C")
            assert DTj.shape == (2, 3)
        df = dt.rbind(res)
        frame_integrity_check(df)
        assert df.names == ("A", "B", "C")
        assert df.shape == (20, 3)
        assert df.to_list() == [
            [0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0, 9],
            [0, 1, 0, 3, 0, 5, 0, 7, 0, 9, 0, 11, 0, 13, 0, 15, 0, 17, 0, 19],
            [0, 5, 0, 22, 0, 16, 0, 10, 0, 4, 0, 21, 0, 15, 0, 9, 0, 3, 0, 20]
        ]
    finally:
        for f in tempfiles:
            os.remove(f)
Example No. 19
def load_table(name, data_dir):
    """
    Load all PSet tables named `name` into a single datatable, dropping any
    duplicate rows.

    @param name: [`string`] The name of the table
    @param data_dir: [`string`] File path to the directory with all PSet tables
    @return: [`datatable.Frame`] A datatable containing all rows from all PSets
    """
    # Get all files
    files = glob.glob(os.path.join(data_dir, '**', f'*{name}.csv'))
    # Keep only paths of the form '{data_dir}/{pset}/{pset}_{name}.csv'
    files = [
        file_name for file_name in files
        if re.search(data_dir + r'/(\w+)/\1_' + name + r'\.csv$', file_name)
    ]
    # Read and concatenate tables
    df = rbind(*iread(files, sep=','))
    # Replace any empty strings with None/NA
    df.replace("", None)
    # Drop duplicates
    # (groups by all columns and selects only the first row from each group)
    df = df[0, :, by(df.names)]

    return df
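A hypothetical call, assuming `data_dir` follows the `{data_dir}/{pset}/{pset}_{name}.csv` layout described in the docstring (the directory name below is illustrative):

cell_table = load_table('cell', 'procdata')   # reads procdata/*/*_cell.csv
print(cell_table.shape)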
Example No. 20
    def create_data(
        X: dt.Frame = None
    ) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray,
               List[np.ndarray], pd.DataFrame, List[pd.DataFrame]]:
        import os
        from h2oaicore.systemutils_more import download
        from h2oaicore.systemutils import config
        import bz2

        def extract_bz2(file, output_file):
            # Decompress the .bz2 archive to output_file.
            with bz2.BZ2File(file) as zipfile, open(output_file, 'wb') as out:
                out.write(zipfile.read())

        temp_path = os.path.join(user_dir(), "recipe_tmp", "airlines")
        os.makedirs(temp_path, exist_ok=True)
        dt.options.nthreads = 8

        # specify which years are used for training and testing
        training = list(range(2005, 2008))
        testing = [2008]

        # download and unzip files
        files = []
        for f in ["%d.csv.bz2" % year for year in training + testing]:
            link = AirlinesData.base_url + "%s" % f
            file = download(link, dest_path=temp_path)
            output_file = file.replace(".bz2", "")
            if not os.path.exists(output_file):
                extract_bz2(file, output_file)
            files.append(output_file)

        # parse with datatable
        X = dt.rbind(*[dt.fread(x) for x in files])

        # add date
        date_col = 'Date'
        X[:, date_col] = (dt.f['Year'] * 10000 + dt.f['Month'] * 100 +
                          dt.f['DayofMonth'])
        cols_to_keep = ['Date']

        # add number of flights in/out for each airport per given interval
        timeslice_mins = 60
        for name, new_col, col, group in [
            ("out", "CRSDepTime_mod", "CRSDepTime", "Origin"),
            ("in", "CRSArrTime_mod", "CRSArrTime", "Dest")
        ]:
            X[:, new_col] = X[:, dt.f[col] // timeslice_mins]
            group_cols = [date_col, group, new_col]
            new_name = 'flights_%s_per_%d_min' % (name, timeslice_mins)
            flights = X[:, {new_name: dt.count()}, dt.by(*group_cols)]
            flights.key = group_cols
            cols_to_keep.append(new_name)
            X = X[:, :, dt.join(flights)]

        # select flights leaving from SFO only
        X = X[dt.f['Origin'] == 'SFO', :]

        # Fill NaNs in DepDelay column
        X[dt.isna(dt.f['DepDelay']), 'DepDelay'] = 0

        # create binary target column
        depdelay_threshold_mins = 15
        target = 'DepDelay%dm' % depdelay_threshold_mins
        X[:, target] = dt.f['DepDelay'] > depdelay_threshold_mins
        cols_to_keep.extend([
            target,
            'Year',
            'Month',
            'DayofMonth',
            'DayOfWeek',
            'CRSDepTime',
            'UniqueCarrier',
            'FlightNum',
            'TailNum',
            'CRSElapsedTime',
            'Origin',
            'Dest',
            'Distance',
            # Leaks for delay
            # 'DepTime',
            # 'ArrTime', #'CRSArrTime',
            # 'ActualElapsedTime',
            # 'AirTime', #'ArrDelay', #'DepDelay',
            # 'TaxiIn', #'TaxiOut', #'Cancelled', #'CancellationCode', #'Diverted', #'CarrierDelay',
            # #'WeatherDelay', #'NASDelay', #'SecurityDelay', #'LateAircraftDelay',
        ])
        X = X[:, cols_to_keep]

        # Join in some extra info
        join_files = [('UniqueCarrier', 'carriers.csv', 'Code'),
                      ('Origin', 'airports.csv', 'iata'),
                      ('Dest', 'airports.csv', 'iata'),
                      ('TailNum', 'plane-data.csv', 'tailnum')]

        for join_key, file, col in join_files:
            file = download(
                'https://0xdata-public.s3.amazonaws.com/data_recipes_data/%s' %
                file,
                dest_path=temp_path)
            X_join = dt.fread(file, fill=True)
            X_join.names = {col: join_key}
            X_join.names = [join_key] + [
                join_key + "_" + x for x in X_join.names if x != join_key
            ]
            X_join.key = join_key
            X = X[:, :, dt.join(X_join)]
            del X[:, join_key]

        split = True
        if not split:
            filename = os.path.join(
                temp_path, "flight_delays_data_recipe_%d-%d.csv" %
                (min(training), max(testing)))
            X.to_csv(filename)
            return filename
        else:
            # prepare splits (by year) and create binary .jay files for import into Driverless AI
            output_files = []
            for condition, name in [
                ((min(training) <= dt.f['Year']) &
                 (dt.f['Year'] <= max(training)), 'training'),
                ((min(testing) <= dt.f['Year']) &
                 (dt.f['Year'] <= max(testing)), 'test'),
            ]:
                X_split = X[condition, :]
                filename = os.path.join(
                    temp_path, "augmented_flights_%s-%d_%s.csv" %
                    (X_split[:, 'Year'].min1(),
                     X_split[:, 'Year'].max1(), name))
                X_split.to_csv(filename)
                output_files.append(filename)
            return output_files
Example No. 21
def test_rbind_modulefn():
    f0 = dt.Frame([1, 5409, 204])
    f1 = dt.Frame([109813, None, 9385])
    f3 = dt.rbind(f0, f1)
    f3.internal.check()
    assert f3.topython()[0] == f0.topython()[0] + f1.topython()[0]
Example No. 22
def test_rbind_modulefn():
    f0 = dt.Frame([1, 5409, 204])
    f1 = dt.Frame([109813, None, 9385])
    f3 = dt.rbind(f0, f1)
    frame_integrity_check(f3)
    assert f3.to_list()[0] == f0.to_list()[0] + f1.to_list()[0]
Example No. 23
}
# pset_tables: ["dose_response", "drug", "datasets_cells",
#     "dataset_statistics", "cell", "drug_annotation", "gene_drug",
#     "profile", "dataset", "mol_cell", "gene_annotation", "dataset_cell",
#     "experiment", "tissue", "gene"]'

pset_name = psets[3]  # GDSC_v1

# -- Read in a single .csv
experiment = fread(
    os.path.join(data_dir, pset_name, pset_tables[pset_name][-3],
                 f'*{pset_tables[pset_name][-3]}*.csv'))

# -- Read in multiple .csv files and make a single Frame
dose_response = rbind(*iread(
    os.path.join(data_dir, pset_name, pset_tables[pset_name][0],
                 f'*{pset_tables[pset_name][0]}*.csv')))

# Can use pattern matching to read in multiple files; ** will match any number of subdirectories
# Should make path parsing code much more compact
all_cell_tables = rbind(
    *iread(os.path.join(data_dir, '**', 'cell', '*cell.csv')))

# -- Write to csv
dose_response.to_csv(
    os.path.join(output_dir, f'{pset_tables[pset_name][0]}.csv'))

# -- Select (of the form df[filter, select, ...])
# f is for Frame and references variables within the Frame object (i.e., columns)
dose_response[:, [f.id, f.experiment_id]]
Example No. 24
def test_rbind_void():
    DT1 = dt.Frame([None] * 10)
    DT2 = dt.Frame([None] * 3)
    res = dt.rbind(DT1, DT2)
    assert res.types == [dt.Type.void]
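A related sketch, assuming the usual promotion behaviour: rbinding a void column with a typed column upcasts the all-None part to the other column's type:

DT = dt.rbind(dt.Frame([None] * 3), dt.Frame([1, 2]))
assert DT.types == [dt.Type.int32]
assert DT.to_list() == [[None, None, None, 1, 2]]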
Example No. 25
def test_not_inplace():
    dt0 = dt.Frame({"A": [5, 1], "B": [4, 4]})
    dt1 = dt.Frame({"A": [22], "B": [11]})
    dtr = dt.rbind(dt0, dt1)
    assert_equals(dtr, dt.Frame({"A": [5, 1, 22], "B": [4, 4, 11]}))
    assert_equals(dt0, dt.Frame({"A": [5, 1], "B": [4, 4]}))
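For contrast, the Frame method of the same name appends in place rather than returning a new frame; a small sketch:

dt0 = dt.Frame({"A": [5, 1], "B": [4, 4]})
dt0.rbind(dt.Frame({"A": [22], "B": [11]}))   # mutates dt0
assert dt0.nrows == 3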
Example No. 26
def test_issue2621_a():
    # Rbinding an iterator of frames should produce correct result
    RES = dt.rbind(dt.Frame(A=[i], B=['hey']) for i in range(10))
    assert_equals(RES, dt.Frame(A=range(10), B=['hey'] * 10))
Example No. 27
def test_issue2621_b():
    src = """c1, c2, c3
             11, 2, 3"""
    RES = dt.rbind(dt.iread([src, src]))
    assert_equals(RES, dt.Frame(c1=[11, 11], c2=[2, 2], c3=[3, 3]))
Example No. 28
# counts by target groups
g = X[:, {"count": count()}, by(target_col)]
if not g.shape[1] == 2:
    raise ValueError(
        "Not a binary target - target column must contain exactly 2 values.")

# find sizes and target values for minority and majority class partitions
n_minority = g[:, min(f.count)][0, 0]
n_majority = g[:, max(f.count)][0, 0]
target_minority = g[f.count == n_minority, target_col][0, 0]
target_majority = g[f.count == n_majority, target_col][0, 0]

# validate that times indeed downsamples majority class
if times * n_minority >= n_majority:
    raise ValueError(
        "Downsampling coefficient `times` is too large: downsampled dataset results in inflated majority class."
    )

# downsample with pandas frame
df_majority = X[f[target_col] == target_majority, :].to_pandas()
df_majority_downsampled = resample(df_majority,
                                   replace=False,
                                   n_samples=n_minority * times,
                                   random_state=random_seed)

return {
    new_dataset_name:
    rbind(X[f[target_col] == target_minority, :],
          dt.Frame(df_majority_downsampled))
}
Example No. 29
def test_bool8_small_view():
    DT0 = dt.Frame([True, False, False, True, None, True, True, None])
    DT1 = dt.Frame([None, None, False, False, True, True, True, True])
    DTS = dt.rbind(DT0[::2, :], DT0[1::2, :]).sort(0)
    assert_equals(DTS, DT1)
Example No. 30
})[:, {
    'frequency': count(),
    'url': None,
    'status': 'not.found',
    'action': None
},
   by(f.hpo)]

# set url for HPO code if it exists
hpoCodes['url'] = dt.Frame([
    f'http://purl.obolibrary.org/obo/{d}' for d in hpoCodes['hpo'].to_list()[0]
])

# manually check each link and search for code. Either add the code or follow
# up with SolveRD project data coordinators.
rbind(fread('data/unknown_hpo_codes.csv'),
      hpoCodes).to_csv('data/unknown_hpo_codes.csv')

# ~ 3c ~
# Investigate 'unavailable' subjects (i.e., subjects that do not exist in
# the current freeze). If subjects were added to 'unavailable', pull
# subjectIDs from other freezes to see whether these subjects exist in
# another freeze
#
otherFreezeIDs = rd3tools.flatten_attr(
    rd3.get('rd3_freeze1_subject', attributes='id', batch_size=10000), 'id')
novelOmicsIDs = rd3tools.flatten_attr(
    rd3.get('rd3_novelomics_subject', attributes='id', batch_size=10000), 'id')

unknownSubjects = dt.Frame(unavailable,
                           types={
                               'id': str,
                               'dateofBirth': str,