Exemple #1
0
    def create_data(X: dt.Frame = None) -> Union[
        str, List[str],
        dt.Frame, List[dt.Frame],
        np.ndarray, List[np.ndarray],
        pd.DataFrame, List[pd.DataFrame],
        Dict[str, str],  # {data set names : paths}
        Dict[str, dt.Frame],  # {data set names : dt frames}
        Dict[str, np.ndarray],  # {data set names : np arrays}
        Dict[str, pd.DataFrame],  # {data set names : pd frames}
    ]:
        """Download the IMDb title/rating/episode dumps and build two datasets.

        :param X: Not used by this implementation.
        :return: A dict with two datatable frames:
            ``imdb_title_ratings`` (titles joined with their ratings) and
            ``imdb_episode_ratings`` (episodes joined with their own rating,
            the rating/metadata of the parent title, and an ``episodeSequence``
            column numbering the episodes of each parent title from 1 to N).
        """
        # Location in the DAI file system where the downloads are stored
        # temporarily
        temp_path = os.path.join(user_dir(), config.contrib_relative_directory)
        os.makedirs(temp_path, exist_ok=True)

        # URLs of the IMDb dataset dumps (gzip-compressed TSV files)
        link_basics = "https://datasets.imdbws.com/title.basics.tsv.gz"
        link_ratings = "https://datasets.imdbws.com/title.ratings.tsv.gz"
        link_episodes = "https://datasets.imdbws.com/title.episode.tsv.gz"

        # Track every file that was actually downloaded so that it is removed
        # even when a later download or the parsing step raises.
        downloaded = []
        try:
            file_basics = download(link_basics, dest_path=temp_path)
            downloaded.append(file_basics)
            file_ratings = download(link_ratings, dest_path=temp_path)
            downloaded.append(file_ratings)
            file_episodes = download(link_episodes, dest_path=temp_path)
            downloaded.append(file_episodes)

            # Parse the downloaded IMDb files into datatable frames
            basics = dt.fread(file_basics, fill=True)
            ratings = dt.fread(file_ratings, fill=True)
            episodes = dt.fread(file_episodes, na_strings=['\\N'], fill=True)
        finally:
            # The frames are already read at this point; drop the temp files
            for path in downloaded:
                if os.path.exists(path):
                    os.remove(path)

        # Create Title with Ratings dataset:
        # drop ratings rows without an averageRating, then join onto titles
        ratings = ratings[~dt.isna(dt.f.averageRating), :]
        ratings.key = "tconst"
        basics_ratings = basics[:, :, dt.join(ratings)]

        # Create Episodes dataset:
        # keep only episodes that have both a season and an episode number
        episodes = episodes[~dt.isna(dt.f.seasonNumber) & ~dt.isna(dt.f.episodeNumber), :]
        episode_ratings = episodes[:, :, dt.join(ratings)]
        # Rename so that 'tconst' now refers to the parent title, which lets
        # the frame be joined with the title-level data below
        episode_ratings.names = {'tconst': 'episodeTconst', 'parentTconst': 'tconst', 'averageRating': 'episodeAverageRating', 'numVotes': 'episodeNumVotes'}
        basics_ratings.key = 'tconst'
        title_episode_ratings = episode_ratings[:, :, dt.join(basics_ratings)]

        # Enumerate the episodes of each parent title from 1 to N
        title_episode_ratings = title_episode_ratings[:, :, dt.sort(dt.f.tconst, dt.f.seasonNumber, dt.f.episodeNumber)]
        group_sizes = title_episode_ratings[:, dt.count(), dt.by(dt.f.tconst)][:, 'count'].to_list()[0]
        from itertools import chain
        cumcount = chain.from_iterable(range(1, n + 1) for n in group_sizes)
        title_episode_ratings['episodeSequence'] = dt.Frame(tuple(cumcount))

        # Return both datasets under their data set names
        return {"imdb_title_ratings": basics_ratings,
                "imdb_episode_ratings": title_episode_ratings}
    def create_data():
        """Build the IEEE fraud train/test frames from local Kaggle CSV files.

        Reads the four identity/transaction CSV files from ``folder_path``,
        joins the identity data into the transactions, adds a formatted
        datetime string column, a month-based fold column and
        ``Trans_D<i>_start`` columns, and splits the result back into
        train and test.

        :return: ``{'IEEE.train': train, 'IEEE.test': test}``, or an empty
            list when any of the four input files is missing.
        """
        folder_path = '/home/ubuntu/data/Kaggle/IEEEFraud'  # Modify as needed

        train_identity_file = os.path.join(folder_path, 'train_identity.csv')
        test_identity_file = os.path.join(folder_path, 'test_identity.csv')
        train_transaction_file = os.path.join(folder_path,
                                              'train_transaction.csv')
        test_transaction_file = os.path.join(folder_path,
                                             'test_transaction.csv')

        # Every input file has to exist; check each path on its own.
        required_files = (train_identity_file, test_identity_file,
                          train_transaction_file, test_transaction_file)
        if not all(os.path.isfile(path) for path in required_files):
            return []

        train_identity = dt.fread(train_identity_file)
        test_identity = dt.fread(test_identity_file)
        train_transaction = dt.fread(train_transaction_file)
        test_transaction = dt.fread(test_transaction_file)

        target = 'isFraud'
        # The joined (right-hand) frames must be keyed on the join column
        train_identity.key = 'TransactionID'
        test_identity.key = 'TransactionID'

        # Join identity into transactions
        train = train_transaction[:, :, dt.join(train_identity)]
        test = test_transaction[:, :, dt.join(test_identity)]

        # Combine train and test for further processing
        X = dt.rbind([train, test], force=True)

        # Turn integer time column into datetime string with proper format
        # (TransactionDT is interpreted as seconds since startdate)
        startdate = datetime.datetime.strptime('2017-11-30', "%Y-%m-%d")
        pd_time = X[:, 'TransactionDT'].to_pandas()['TransactionDT'].apply(
            lambda x: (startdate + datetime.timedelta(seconds=x)))
        X[:, 'TransactionDT_str'] = dt.Frame(
            pd_time.apply(
                lambda x: datetime.datetime.strftime(x, "%Y-%m-%d %H:%M:%S")))
        # Month - to be used as fold column (that way get cross-validation
        # without shuffling future/past too much, minimize overlap between
        # folds)
        fold_column = 'fold_column'
        X[:, fold_column] = dt.Frame(pd_time.dt.month +
                                     (pd_time.dt.year - 2017) * 12)

        # Create start times for Dx features (which are growing linearly over
        # time): whole days of TransactionDT minus the D<i> column
        for i in range(1, 16):
            X[:, 'Trans_D%d_start' % i] = dt.Frame(
                np.floor(X[:, 'TransactionDT'].to_numpy().ravel() /
                         (24 * 60 * 60)) - X[:, 'D%d' % i].to_numpy().ravel())

        # Re-order names so that target and fold column come first
        first_names = [target, fold_column]
        names = first_names + [x for x in X.names if x not in first_names]
        X = X[:, names]

        # Split back into train and test
        train = X[:train_transaction.nrows, :]
        test = X[train_transaction.nrows:, :]
        return {'IEEE.train': train, 'IEEE.test': test}
Exemple #3
0
def analyzeDailyAndMeldeTag(fullTable, fromDay, toDay, byCriteria,
                            criteriaValue, filter, postfix):
    """Combine per-DatenstandTag and per-MeldeTag daily analyses in one frame.

    Three `analyzeDaily` results are computed: one over `fullTable` by
    "DatenstandTag", one over the rows of the latest DatenstandTag by
    "MeldeTag", and one over the rows from 7 days before the latest
    DatenstandTag by "MeldeTag". The two MeldeTag results get their
    "MeldeTag" column renamed to "DatenstandTag". All three are then joined
    onto a frame holding the sorted union of their days.

    Returns the joined frame, keyed on "DatenstandTag".
    """

    def keyedOnDatenstandTag(table):
        # Rename the MeldeTag column so the table can be joined by day.
        table.names = {"MeldeTag": "DatenstandTag"}
        table.key = "DatenstandTag"
        return table

    def daysOf(table):
        # Distinct day values present in a table.
        return set(table[:, "DatenstandTag"].to_list()[0])

    # Analysis by DatenstandTag, restricted to the requested day range.
    datenstandFilter = filter & filterByDayAndCriteria(
        fromDay, toDay, (byCriteria == criteriaValue), "DatenstandTag")
    dayTable = analyzeDaily(fullTable, datenstandFilter, "", postfix,
                            "DatenstandTag")

    maxDatenstandTag = fullTable[:, dt.f.DatenstandTag].max().to_list()[0][0]
    print("maxDatenstandTag", maxDatenstandTag)

    # Snapshot of the newest data and the snapshot one week earlier.
    latestTable = fullTable[dt.f.DatenstandTag == maxDatenstandTag, :]
    olderTable = fullTable[dt.f.DatenstandTag == maxDatenstandTag - 7, :]

    meldeTagColumn = latestTable[:, dt.f.MeldeTag]
    minMeldeTag = meldeTagColumn.min().to_list()[0][0]
    maxMeldeTag = meldeTagColumn.max().to_list()[0][0]

    # Analysis by MeldeTag over the full MeldeTag range of the latest snapshot.
    meldeFilter = filter & filterByDayAndCriteria(
        minMeldeTag, maxMeldeTag + 1, (byCriteria == criteriaValue),
        "MeldeTag")

    meldeTable = keyedOnDatenstandTag(
        analyzeDaily(latestTable, meldeFilter, "MeldeTag_", postfix,
                     "MeldeTag"))
    meldeTable7TageAlt = keyedOnDatenstandTag(
        analyzeDaily(olderTable, meldeFilter, "MeldeTag_Vor7Tagen_", postfix,
                     "MeldeTag"))

    dayTable.key = "DatenstandTag"

    # Every day that occurs in any of the three tables, in ascending order.
    allDays = sorted(
        daysOf(meldeTable) | daysOf(dayTable) | daysOf(meldeTable7TageAlt))

    allDaysTable = dt.Frame(allDays)
    allDaysTable.names = ["DatenstandTag"]
    allDaysTable.key = "DatenstandTag"

    # Join the three analyses onto the day axis one after another.
    withMelde = allDaysTable[:, :, dt.join(meldeTable)]
    withMelde7 = withMelde[:, :, dt.join(meldeTable7TageAlt)]
    allDaysTable = withMelde7[:, :, dt.join(dayTable)]
    allDaysTable.key = "DatenstandTag"
    return allDaysTable
Exemple #4
0
def test_join_errors():
    """Check the errors raised for unkeyed, mismatched-name and
    mismatched-type join frames."""
    left = dt.Frame(A=[1, 2, 3])
    right = dt.Frame(B=range(10), stype=dt.float64)

    # Case 1: the right-hand frame has no key yet.
    with pytest.raises(ValueError) as excinfo:
        left[:, :, join(right)]
    assert "The join frame is not keyed" in str(excinfo.value)

    # Case 2: keyed on a column that the left frame does not have.
    right.key = "B"
    with pytest.raises(ValueError) as excinfo:
        left[:, :, join(right)]
    message = str(excinfo.value)
    assert "Key column `B` does not exist in the left Frame" in message

    # Case 3: column names match, but int vs. real types do not.
    right.names = ("A", )
    with pytest.raises(TypeError) as excinfo:
        left[:, :, join(right)]
    message = str(excinfo.value)
    assert ("Join column `A` has type int in the left Frame, and type real "
            "in the right Frame" in message)
Exemple #5
0
def map_foreign_key_to_table(primary_df: dt.Frame, fk_df: dt.Frame,
                             join_column_dict: dict) -> dt.Frame:
    """
    Performs a left join of `primary_df` to `fk_df` by reference, replacing
    the join column of `primary_df` with the `id` column of `fk_df`.

    Note that `fk_df` is modified as well: its join column is renamed to the
    name of the join column in `primary_df` and set as the frame key.

    :primary_df: A `datatable.Frame`. This should be the larger table
        and will ideally be loaded from a .jay file with a `memory_limit`
        specified in `datatable.fread`.
    :fk_df: A `datatable.Frame`. This should be a smaller table
        which will be joined to `primary_df`. It must contain an `id`
        column.
    :join_column_dict: A dictionary with keys 'primary_df' and 'fk_df'
        specifying the columns to join the tables on.
    :returns: `primary_df`, which has been updated in place.
    :raises ValueError: If `join_column_dict` lacks one of the required keys.
    """
    # Check for correct keys in dict
    if ('primary_df' not in join_column_dict
            or 'fk_df' not in join_column_dict):
        raise ValueError("The join_column_dict item must have keys "
                         "'primary_df' and 'fk_df'!")
    # Rename the foreign-key column so both frames share the join column
    # name, then key the foreign-key frame on it (required by `join`)
    primary_col = join_column_dict['primary_df']
    fk_col = join_column_dict['fk_df']
    fk_df.names = {fk_col: primary_col}
    fk_df.key = primary_col
    update_expr = {primary_col: g.id}
    # Join and overwrite the join column by reference with the matched id
    primary_df[:, update(**update_expr), join(fk_df)]
    return primary_df
Exemple #6
0
def test_join_random(seed, lt):
    """Randomized join test for one ltype.

    Builds a keyed frame of random unique keys (of a random stype of `lt`)
    with a VAL column, joins it onto a frame of randomly drawn keys, and
    compares the joined result against values looked up in plain Python.
    """
    random.seed(seed)
    ndata = int(random.expovariate(0.0005))
    nkeys = int(random.expovariate(0.01)) + 1
    st = random.choice(lt.stypes)

    # Generate the candidate keys; duplicates are removed via a set.
    if lt == ltype.bool:
        keys = [True, False]
    elif lt == ltype.int:
        if st == stype.int8:
            nbits = 6
        elif st == stype.int16:
            nbits = 12
        else:
            nbits = 24
        keys = list({random.getrandbits(nbits) for _ in range(nkeys)})
    elif lt == ltype.real:
        keys = [random.random() for _ in range(nkeys)]
        if st == stype.float32:
            # Round-trip through a Frame so the values are de-duplicated
            # as they come back from a float32 column.
            keys = dt.Frame(keys, stype=st).topython()[0]
        keys = list(set(keys))
    else:
        length = int(random.expovariate(0.05)) + 1
        keys = list({random_string(length) for _ in range(nkeys)})
    nkeys = len(keys)

    # Keyed lookup frame.
    dkey = dt.Frame(KEY=keys, VAL=range(nkeys), stypes={"KEY": st})
    dkey.key = "KEY"
    keys, vals = dkey.topython()

    # Main frame and the expected joined values.
    main = [random.choice(keys) for _ in range(ndata)]
    dmain = dt.Frame(KEY=main, stype=st)
    expected_vals = [vals[keys.index(item)] for item in main]

    djoined = dmain[:, :, join(dkey)]
    djoined.internal.check()
    assert djoined.shape == (ndata, 2)
    assert djoined.names == ("KEY", "VAL")
    assert djoined.topython() == [main, expected_vals]
Exemple #7
0
def test_join_empty_frame():
    """Joining with an empty keyed frame should give back the left frame.

    See issue #1988.
    """
    left = dt.Frame(A=range(5), B=['gs', 'dfk', None, 'ava;lej', 'fdsfal;k'])
    empty_keyed = dt.Frame(A=[])
    empty_keyed.key = "A"
    joined = left[:, :, dt.join(empty_keyed)]
    assert_equals(joined, left)
Exemple #8
0
def analyzeDailyAltersgruppenGeschlechter(fullTable, fromDay, toDay,
                                          byCriteria, criteriaValue,
                                          Altersgruppen, Geschlechter):
    """Build the per-day table over all rows and per Geschlecht.

    Starts with the unfiltered analysis (filter `True`, empty postfix)
    extended by `analyzeDailyAltersgruppen`. Then, for every entry of
    `Geschlechter` except "unbekannt", joins the analysis filtered on that
    Geschlecht (postfix "_G_<Geschlecht>") and again extends the table via
    `analyzeDailyAltersgruppen` with the same filter and postfix.
    """
    # Unfiltered baseline.
    byDayTable = analyzeDailyAndMeldeTag(fullTable, fromDay, toDay, byCriteria,
                                         criteriaValue, True, "")
    byDayTable = analyzeDailyAltersgruppen(fullTable, byDayTable, fromDay,
                                           toDay, byCriteria, criteriaValue,
                                           True, Altersgruppen, Geschlechter,
                                           "")

    for geschlecht in Geschlechter:
        if geschlecht == "unbekannt":
            continue

        print("Analyzing Geschlechter " + geschlecht)
        geschlechtFilter = (dt.f.Geschlecht == geschlecht)
        geschlechtPostfix = "_G_" + geschlecht

        byDayTableG = analyzeDailyAndMeldeTag(fullTable, fromDay, toDay,
                                              byCriteria, criteriaValue,
                                              geschlechtFilter,
                                              geschlechtPostfix)
        print("byDayTableG", byDayTableG.names)

        byDayTable = byDayTable[:, :, dt.join(byDayTableG)]
        print("byDayTable 2", byDayTable.names)

        byDayTable = analyzeDailyAltersgruppen(fullTable, byDayTable,
                                               fromDay, toDay, byCriteria,
                                               criteriaValue,
                                               geschlechtFilter,
                                               Altersgruppen, Geschlechter,
                                               geschlechtPostfix)
        print("byDayTable 3", byDayTable.names)

    return byDayTable
Exemple #9
0
def test_select_from_joined():
    """Selecting unmatched elements of a joined frame must not crash.

    The selection uses the "fast" DT[i, j] syntax with integer i and j.
    None of the keys in the source frame exists in the joined frame, so every
    joined cell is expected to be None. See issue #1917.
    """
    column_stypes = {
        "A": dt.int32,
        "B": dt.bool8,
        "C1": dt.int8,
        "C2": dt.int16,
        "C3": dt.int32,
        "C4": dt.int64,
        "D1": dt.float32,
        "D2": dt.float64,
        "E1": dt.str32,
        "E2": dt.str64
    }
    keyed = dt.Frame(A=[0],
                     B=[True],
                     C1=[34],
                     C2=[17],
                     C3=[18],
                     C4=[20],
                     D1=[5.2],
                     D2=[-7.7],
                     E1=["foo"],
                     E2=["bar"],
                     stypes=column_stypes)
    keyed.key = "A"

    source = dt.Frame(A=[1, 3, 7], stype=dt.int32)
    joined = source[:, :, join(keyed)]

    # Column 0 is the key column itself; all others come from the join.
    for row in range(3):
        for col in range(1, joined.ncols):
            assert joined[row, col] is None
Exemple #10
0
    def __call__(self,
                 rows=None,
                 select=None,
                 verbose=False,
                 timeit=False,
                 groupby=None,
                 join=None,
                 sort=None,
                 engine=None):
        """DEPRECATED, use DT[i, j, ...] instead.

        Emits a FutureWarning and forwards to
        ``self[rows, select, join(...), by(...), sort(...)]``. When `rows` or
        `select` is a plain Python function it is first called with
        ``datatable.f``. With ``timeit=True`` the elapsed time is printed.
        `verbose` and `engine` are accepted but not used here.
        """
        deprecation_msg = (
            "`DT(rows, select, ...)` is deprecated and will be removed in "
            "version 0.9.0. Please use `DT[i, j, ...]` instead")
        warnings.warn(deprecation_msg, category=FutureWarning)

        if timeit:
            started_at = time.time()

        # Plain functions/lambdas are evaluated against the `f` namespace.
        function_type = type(lambda: None)
        if isinstance(rows, function_type):
            rows = rows(datatable.f)
        if isinstance(select, function_type):
            select = select(datatable.f)

        result = self[rows, select,
                      datatable.join(join),
                      datatable.by(groupby),
                      datatable.sort(sort)]

        if timeit:
            elapsed = time.time() - started_at
            print("Time taken: %d ms" % (1000 * elapsed))
        return result
Exemple #11
0
    def transform(self, X: dt.Frame):
        """Map a zipcode column to the features of each zipcode.

        The distinct non-missing values of the first column are passed to
        `self.get_zipcode_features`; the results are joined back onto the
        input by zipcode. `self._output_feature_names` and
        `self._feature_desc` are set from the resulting feature columns.

        Returns the joined frame without the zipcode column. If anything
        raises, a warning is logged and an array of zeros with one entry per
        input row is returned instead.
        """
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

        try:
            X = dt.Frame(X)
            original_zip_column_name = X.names[0]
            X.names = ['zip_key']
            X = X[:, 'zip_key']

            # Look the features up once per distinct, non-missing zipcode.
            non_missing = X[~dt.isna(dt.f.zip_key), 0]
            zip_list = dt.unique(non_missing).to_list()[0]
            zip_features = [self.get_zipcode_features(zipcode)
                            for zipcode in zip_list]

            # Keyed lookup frame: zipcode plus its feature columns.
            X_g = dt.Frame({"zip_key": zip_list})
            X_g.cbind(dt.Frame(zip_features))
            X_g.key = 'zip_key'

            X_result = X[:, :, dt.join(X_g)]
            # Everything after the first (zipcode) column is a feature.
            feature_frame = X_result[:, 1:]
            feature_names = list(feature_frame.names)

            self._output_feature_names = [
                "{}.{}".format(original_zip_column_name, name)
                for name in feature_names
            ]
            self._feature_desc = [
                "Property '{}' of US zipcode found in '{}'".format(
                    name, original_zip_column_name)
                for name in feature_names
            ]
            return feature_frame
        except Exception as ex:
            loggerwarning(
                logger, "USZipcodeDatabaseTransformer got exception {}".format(
                    type(ex).__name__))
            return np.zeros(X.shape[0])
Exemple #12
0
def test_join_missing_levels():
    """A left key without a match in the keyed frame yields None."""
    left = dt.Frame(A=[1, 2, 3])
    keyed = dt.Frame(A=[1, 2], K=[True, False])
    keyed.key = "A"
    joined = left[:, :, join(keyed)]
    joined.internal.check()
    expected = [[1, 2, 3], [True, False, None]]
    assert joined.topython() == expected
Exemple #13
0
def test_join_error_no_left_column():
    """Joining on a key column absent from the left frame raises ValueError."""
    left = dt.Frame(A=[1, 2, 3])
    keyed = dt.Frame(B=range(10))
    keyed.key = "B"
    with pytest.raises(ValueError) as excinfo:
        noop(left[:, :, join(keyed)])
    message = str(excinfo.value)
    assert "Key column `B` does not exist in the left Frame" in message
Exemple #14
0
def test_join_missing_levels():
    """A left key without a match in the keyed frame yields None."""
    left = dt.Frame(A=[1, 2, 3])
    keyed = dt.Frame(A=[1, 2], K=[True, False])
    keyed.key = "A"
    joined = left[:, :, join(keyed)]
    frame_integrity_check(joined)
    expected = [[1, 2, 3], [True, False, None]]
    assert joined.to_list() == expected
Exemple #15
0
def test_join_error_type_mismatch():
    """Joining an int column to a string key column raises TypeError."""
    left = dt.Frame(A=[1, 2, 3])
    keyed = dt.Frame(A=[str(n) for n in range(10)])
    keyed.key = "A"
    with pytest.raises(TypeError) as excinfo:
        noop(left[:, :, join(keyed)])
    message = str(excinfo.value)
    assert ("Column `A` of type int32 in the left Frame cannot be joined to "
            "column `A` of incompatible type str32 in the right Frame"
            in message)
Exemple #16
0
def test_write_joined_frame():
    """A joined frame with unmatched rows must serialize to CSV correctly.

    The joined frame will have a rowindex with some rows missing (-1);
    those rows are expected to produce empty fields. See issue #1919.
    """
    keyed = dt.Frame(A=range(5), B=list('ABCDE'))
    keyed.key = "A"
    left = dt.Frame(A=[3, 7, 11, -2, 0, 1])
    joined = left[:, :, dt.join(keyed)]
    csv_text = joined.to_csv()
    expected_csv = 'A,B\n3,D\n7,\n11,\n-2,\n0,A\n1,B\n'
    assert csv_text == expected_csv
Exemple #17
0
def test_issue1556():
    """Join on a string key where only one of the left values matches."""
    left = dt.Frame(A=['Ahoy ye matey!', 'hey'])
    keyed = dt.Frame(A=['hey'], B=['Avast'])
    keyed.key = 'A'
    joined = left[:, :, join(keyed)]
    frame_integrity_check(joined)
    assert joined.shape == (2, 2)
    expected = {"A": ["Ahoy ye matey!", "hey"],
                "B": [None, "Avast"]}
    assert joined.to_dict() == expected
Exemple #18
0
def test_issue1800():
    """Assign a column of a joined frame back through a boolean row filter."""
    keyed = dt.Frame(A=range(5), B=[0.1, 0.2, 0.3, 0.4, 0.5])
    keyed.key = "A"
    target = dt.Frame(A=[0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5])
    joined = target[:, :, dt.join(keyed)]

    # A boolean selector that picks every row.
    all_rows = dt.Frame([True] * target.nrows)
    target[all_rows, "N"] = joined[all_rows, "B"]

    # Key 5 has no match in the keyed frame, hence the trailing Nones.
    expected = {"A": [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5],
                "N": [0.1, 0.1, 0.2, 0.2, 0.3, 0.3, 0.4, 0.4,
                      0.5, 0.5, None, None]}
    assert target.to_dict() == expected
Exemple #19
0
def test_join_view():
    """Join onto a frame obtained by row-filtering another one.

    See issue #1540.
    """
    base = dt.Frame(A=[1, 2, 3, 1, 2, 3],
                    B=[3, 6, 2, 4, 3, 1],
                    C=list("bdbbdb"))
    filtered = base[f.A == 1, ['A', 'B', 'C']]
    keyed = dt.Frame(C=['b', 'z'], BB=[2, 1000])
    keyed.key = 'C'
    joined = filtered[:, :, join(keyed)]
    assert joined.shape == (2, 4)
    assert joined.names == ("A", "B", "C", "BB")
    expected = [[1, 1], [3, 4], ['b', 'b'], [2, 2]]
    assert joined.to_list() == expected
Exemple #20
0
def test_join_void_to_void():
    """Join an all-None column onto a keyed all-None column."""
    left = dt.Frame(A=[None, None, None], B=[3, 4, 7])
    keyed = dt.Frame(A=[None], V=["nothing"])
    keyed.key = "A"
    joined = left[:, :, join(keyed)]
    # Every None on the left is expected to match the single None key.
    expected = dt.Frame(A=[None, None, None],
                        B=[3, 4, 7],
                        V=["nothing", "nothing", "nothing"])
    assert_equals(joined, expected)
Exemple #21
0
def test_join_strings():
    """Join on a string key column, with every left value having a match."""
    left = dt.Frame([[1, 3, 2, 1, 1, 2, 0], list("cabdabb")],
                    names=("A", "B"))
    keyed = dt.Frame([list("abcd"), range(0, 20, 5)], names=("B", "V"))
    keyed.key = "B"
    joined = left[:, :, join(keyed)]
    joined.internal.check()
    assert joined.shape == (7, 3)
    assert joined.names == ("A", "B", "V")
    expected = [[1, 3, 2, 1, 1, 2, 0],
                ["c", "a", "b", "d", "a", "b", "b"],
                [10, 0, 5, 15, 0, 5, 5]]
    assert joined.topython() == expected
Exemple #22
0
    def join_self(self):
        """Join the frame with itself and mirror the result in the model.

        Without keys the join is expected to raise ValueError. With keys,
        the non-key columns are appended once more to the Python-side
        data/types/names, the key count is reset to 0 and names are
        de-duplicated.
        """
        ncols = self.ncols

        if not self.nkeys:
            # Unkeyed frames cannot be used as the join frame.
            msg = "The join frame is not keyed"
            with pytest.raises(ValueError, match=msg):
                self.df = self.df[:, :, join(self.df)]
            return

        self.df = self.df[:, :, join(self.df)]

        # Columns after the key columns are the ones the join adds.
        non_key = slice(self.nkeys, ncols)
        added_data = copy.deepcopy(self.data[non_key])
        added_types = self.types[non_key].copy()
        added_names = self.names[non_key].copy()

        self.data += added_data
        self.types += added_types
        self.names += added_names
        self.nkeys = 0
        self.dedup_names()
Exemple #23
0
def test_html_repr_joined_frame():
    """The HTML repr of a frame joined on a two-column key."""
    left = dt.Frame([[5, 6, 7, 9], [7, 8, 9, 10]], names=["A", "B"])
    keyed = dt.Frame([[5, 7], [7, 9], [1, 2]], names=["A", "B", "yhat"])
    keyed.key = ["A", "B"]
    joined = left[:, :, dt.join(keyed)]
    parsed = parse_html_repr(joined._repr_html_())
    assert parsed.names == ("A", "B", "yhat")
    assert parsed.shape == (4, 3)
    # Rows without a matching (A, B) pair have an empty yhat cell.
    expected_rows = [['5', '7', '1'],
                     ['6', '8', None],
                     ['7', '9', '2'],
                     ['9', '10', None]]
    assert parsed.data == expected_rows
Exemple #24
0
def test_join_update():
    """Assign a joined column (the per-group mean of B) as new column AA."""
    group_ids = [1, 2, 3, 2, 3, 1, 3, 2, 2, 1]
    d0 = dt.Frame([group_ids, range(10)], names=("A", "B"))
    group_means = d0[:, mean(f.B), f.A]
    group_means.key = "A"
    d0[:, "AA", join(group_means)] = g.V0
    assert d0.names == ("A", "B", "AA")

    # Mean of B within each group of A.
    mean_by_group = {1: 14.0 / 3, 2: 4.75, 3: 4}
    expected = [[1, 2, 3, 2, 3, 1, 3, 2, 2, 1],
                [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                [mean_by_group[gid] for gid in group_ids]]
    assert d0.to_list() == expected
Exemple #25
0
def test_join_and_select_g_col():
    """Selecting a g-column must not be confused with an f-column.

    See issue #1352.
    """
    left = dt.Frame(a=[0, 2, 3], b=[3, 4, 2])
    keyed = dt.Frame(b=[2, 4], c=["foo", "bar"])
    keyed.key = "b"
    selected = left[:, g.c, join(keyed)]
    selected.internal.check()
    assert selected.shape == (3, 1)
    assert selected.stypes == (stype.str32, )
    # assert selected.names == ("c",)   # not working yet
    expected = [[None, "bar", "foo"]]
    assert selected.topython() == expected
def test_dt_isna_joined():
    """Apply isna() to the non-key columns of a joined frame.

    See issue #2109.
    """
    left = dt.Frame(A=[None, 4, 3, 2, 1])
    keyed = dt.Frame(A=[0, 1, 3, 7],
                     B=['a', 'b', 'c', 'd'],
                     C=[0.25, 0.5, 0.75, 1.0],
                     D=[22, 33, 44, 55],
                     E=[True, False, True, False])
    keyed.key = 'A'
    result = left[:, dt.math.isna(g[1:]), join(keyed)]
    frame_integrity_check(result)
    # Only the left values 3 and 1 have a match, so every one of the four
    # joined columns shares the same missing-value pattern.
    missing_pattern = [True, True, False, True, False]
    assert result.to_list() == [missing_pattern] * 4
Exemple #27
0
def analyzeDailyAltersgruppen(fullTable, byDayTable, filter, Altersgruppen,
                              Geschlechter, postfix):
    """Join one `analyzeDaily` result per Altersgruppe onto `byDayTable`.

    For each entry of `Altersgruppen` the analysis is run with `filter`
    narrowed to that Altersgruppe and the postfix "<postfix>-AG-<gruppe>".
    The table is re-keyed on "DatenstandTag" after every join.
    `Geschlechter` is accepted but not used here.

    Returns the extended table.
    """
    result = byDayTable
    for altersgruppe in Altersgruppen:
        print("Analyzing Altergruppe " + altersgruppe)
        gruppenFilter = filter & (dt.f.Altersgruppe == altersgruppe)
        gruppenPostfix = postfix + "-AG-" + altersgruppe
        byDayTableAG = analyzeDaily(fullTable, gruppenFilter, gruppenPostfix)
        result = result[:, :, dt.join(byDayTableAG)]
        result.key = "DatenstandTag"
    return result
Exemple #28
0
def test_join_simple():
    """Basic join on an integer key, with every left value having a match."""
    left = dt.Frame([[1, 3, 2, 1, 1, 2, 0], list("abcdefg")],
                    names=("A", "B"))
    number_words = ["zero", "one", "two", "three"]
    keyed = dt.Frame([range(4), number_words], names=("A", "V"),
                     stypes=left.stypes)
    keyed.key = "A"
    joined = left[:, :, join(keyed)]
    frame_integrity_check(joined)
    assert joined.shape == (7, 3)
    assert joined.names == ("A", "B", "V")
    expected = [
        [1, 3, 2, 1, 1, 2, 0],
        ["a", "b", "c", "d", "e", "f", "g"],
        ["one", "three", "two", "one", "one", "two", "zero"]]
    assert joined.to_list() == expected
Exemple #29
0
def analyzeDailyAltersgruppen(fullTable, byDayTable, fromDay, toDay, byCriteria, criteriaValue, filter, Altersgruppen, Geschlechter, postfix):
    """Join one `analyzeDailyAndMeldeTag` result per Altersgruppe onto `byDayTable`.

    Every entry of `Altersgruppen` except "unbekannt" is analyzed with
    `filter` narrowed to that Altersgruppe and the postfix
    "<postfix>_AG_<agColName(gruppe)>". The table is re-keyed on
    "DatenstandTag" after every join. `Geschlechter` is accepted but not
    used here.

    Returns the extended table.
    """
    result = byDayTable
    for altersgruppe in Altersgruppen:
        if altersgruppe == "unbekannt":
            continue

        print("Analyzing Altergruppe "+ altersgruppe)

        gruppenFilter = filter & (dt.f.Altersgruppe == altersgruppe)
        gruppenPostfix = postfix+"_AG_"+agColName(altersgruppe)
        byDayTableAG = analyzeDailyAndMeldeTag(
            fullTable, fromDay, toDay, byCriteria, criteriaValue,
            gruppenFilter, gruppenPostfix)
        result = result[:, :, dt.join(byDayTableAG)]
        result.key = "DatenstandTag"
    return result
Exemple #30
0
def test_join_multi():
    """Join on a two-column key; unmatched (A, B) pairs yield None."""
    keyed = dt.Frame(A=[1, 2, 1, 2],
                     B=[3, 3, 4, 4],
                     C=["goo", "blah", "zoe", "rij"])
    keyed.key = ("A", "B")
    left = dt.Frame([[1, 2, 3, 2, 3, 1, 2, 1, 1],
                     [3, 4, 5, 4, 3, 3, 3, 4, 3]], names=("A", "B"))
    joined = left[:, :, join(keyed)]
    assert joined.names == ("A", "B", "C")
    expected = [[1, 2, 3, 2, 3, 1, 2, 1, 1],
                [3, 4, 5, 4, 3, 3, 3, 4, 3],
                ["goo", "rij", None, "rij", None,
                 "goo", "blah", "zoe", "goo"]]
    assert joined.to_list() == expected