Example #1
def test_groups_large2_str(n, seed):
    random.seed(seed)
    while n == 0:
        n = int(random.expovariate(0.0005))
    src = ["%x" % random.getrandbits(6) for _ in range(n)]
    f0 = dt.Frame({"A": src})
    f1 = f0[:, count(), by("A")]
    f1.internal.check()
    assert f1.nrows == len(set(src))
Example #2
def py_dt_two_group_proportions_summary(DT,por1,por2):
    
    DT_summary = DT[:,dt.count(),by(f[por1],f[por2])
                   ][:,f[:].extend({'group_tot':dt.sum(f.count)}),by(f[por1])
                    ][:,f[:].extend({'prop':f.count/f.group_tot})
                     ][:,f[:].remove(f[1])
                      ]
    
    return DT_summary
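
A usage sketch (illustrative only; the frame and column names are borrowed from the police-stops tutorial further down in this listing): the function computes a count per (por1, por2) pair and each pair's share within its por1 group.

# e.g. share of each violation type within each driver_gender group
py_dt_two_group_proportions_summary(policia_tidy_dt, 'driver_gender', 'violation')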
Example #3
def test_issue_2242(seed):
    n = 25000
    X = dt.Frame(AGE=[random.randint(1, 50) for i in range(n)],
                 PAY=[random.choice([True, False]) for i in range(n)])
    RES = X[:, dt.math.log((count() + 1) / (sum(f.PAY) + 0.5) - 1), by(f.AGE)]
    assert RES.shape == (50, 2)
    data = RES.to_list()
    assert data[0] == list(range(1, 51))
    assert all(isinstance(x, float) for x in data[1])
Example #4
def test_issue2348():
    DT = dt.Frame(A=[1, 2, 3, 1, 2, 3], B=list('akdfnv'),
                  C=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
                  D=[11]*6, E=[2]*6)
    # Check that these expressions do not crash
    DT[:, :, by(f.A), sort(f.A, f.E)]
    DT[:, :, by(f.A, f.B), sort(f.A, f.B)]
    assert_equals(DT[:, dt.count(), by(f.D), sort(f.E, f.A)],
                  dt.Frame([[11], [6]],
                           names=["D", "count"],
                           stypes=[dt.int32, dt.int64]))
Example #5
def test_sort_multicolumn1():
    # See issue #3141
    DT = dt.Frame(A=[111] * 100,
                  B=['a', 'b'] * 50,
                  C=['ads', 'adfv', 'adfv', 'adsfv'] * 25)
    RES1 = DT[:, dt.count(), dt.by(f.A, f.B, f.C)]
    assert_equals(
        RES1,
        dt.Frame(A=[111, 111, 111, 111],
                 B=['a', 'a', 'b', 'b'],
                 C=['adfv', 'ads', 'adfv', 'adsfv'],
                 count=[25, 25, 25, 25] / dt.int64))
    RES2 = DT[:, dt.count(), dt.by(f.B, f.C)]
    assert_equals(
        RES2,
        dt.Frame(B=['a', 'a', 'b', 'b'],
                 C=['adfv', 'ads', 'adfv', 'adsfv'],
                 count=[25, 25, 25, 25] / dt.int64))
    RES3 = DT[:, dt.count(), dt.by(f.A, f.B)]
    assert_equals(
        RES3, dt.Frame(A=[111, 111], B=['a', 'b'], count=[50, 50] / dt.int64))
Example #6
def test_group_empty_frame2():
    DT = dt.Frame(A=[])
    D1 = DT[:, count(), by(f.A)]
    frame_integrity_check(D1)
    assert D1.shape == (0, 2)
    assert D1.stypes == (DT.stype, dt.int64)
Example #7
def test_group_boolean2():
    DT = dt.Frame(A=[True, False, False] * 500 + [None, True])
    DTR = DT[:, count(), by(f.A)]
    assert_equals(DTR, dt.Frame(A=[None, False, True], count=[1, 1000, 501],
                                stypes={"count": dt.int64}))
Example #8
          time_sec=t,
          mem_gb=m,
          cache=cache,
          chk=make_chk(flatten(chk.to_list())),
          chk_time_sec=chkt,
          on_disk=on_disk)
print(ans.head(3), flush=True)
print(ans.tail(3), flush=True)
del ans

question = 'sum v3 count by id1:id6'  # q10
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {
    'v3': sum(f.v3),
    'count': count()
},
        by(f.id1, f.id2, f.id3, f.id4, f.id5, f.id6)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, [sum(f.v3), sum(f.count)]]
chkt = timeit.default_timer() - t_start
write_log(task=task,
          data=data_name,
          in_rows=x.shape[0],
          question=question,
          out_rows=ans.shape[0],
          out_cols=ans.shape[1],
          solution=solution,
Here is the basic syntax representation of a datatable frame:


                                        DT[I,J,BY|SORT|JOIN]
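
For orientation, a minimal sketch of the three slots on a small throwaway frame (not part of the original tutorial): I filters rows, J selects or aggregates columns, and the third slot takes by(), sort() or join().

DT = dt.Frame(A=[1, 2, 1, 2], B=[10, 20, 30, 40])
DT[f.A == 1, :]                  # I  : keep only the rows where A == 1
DT[:, dt.sum(f.B)]               # J  : aggregate a column
DT[:, dt.sum(f.B), by(f.A)]      # BY : aggregate per group of A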

Below, a sample dataframe is created with the column **languages**, and we would like to count how many students are interested in learning each language category, using by together with aggregation functions such as count, min, max, mean, etc.

That is exactly what the **count** function is for: it calculates the number of observations per group. Let us see how it works below.

prog_lang_dt = dt.Frame(languages= ['html', 'R', 'R', 'html', 'R', 'javascript',
                                    'R', 'javascript', 'html'])

prog_lang_dt

prog_lang_dt[:,count(),by(f.languages)]

If we would like to rename the count column as total, it can be done as follows:

prog_lang_dt[:,{'total':count()},by(f.languages)]

**count** can also take a column name as an argument and report the number of non-missing entries in that specific column. For this example we will create a new dataframe Y.

data = """
       id|charges|payment_method
       634-VHG|28|Cheque
       365-DQC|33.5|Credit card
       264-PPR|631|--
       845-AJO|42.3|
       789-KPO|56.9|Bank Transfer
       """
Example #10
    def create_data(
        X: dt.Frame = None
    ) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray,
               List[np.ndarray], pd.DataFrame, List[pd.DataFrame], Dict[
                   str, str],  # {data set names : paths}
               Dict[str, dt.Frame],  # {data set names : dt frames}
               Dict[str, np.ndarray],  # {data set names : np arrays}
               Dict[str, pd.DataFrame],  # {data set names : pd frames}
               ]:
        # Download files
        # Location in DAI file system where we will save the data set
        temp_path = os.path.join(user_dir(), config.contrib_relative_directory)
        os.makedirs(temp_path, exist_ok=True)

        # URLs of the IMDB datasets to download
        link_basics = "https://datasets.imdbws.com/title.basics.tsv.gz"
        link_ratings = "https://datasets.imdbws.com/title.ratings.tsv.gz"
        link_episodes = "https://datasets.imdbws.com/title.episode.tsv.gz"

        # Download the files
        file_basics = download(link_basics, dest_path=temp_path)
        file_ratings = download(link_ratings, dest_path=temp_path)
        file_episodes = download(link_episodes, dest_path=temp_path)

        # parse the downloaded IMDB files with datatable
        basics = dt.fread(file_basics, fill=True)
        ratings = dt.fread(file_ratings, fill=True)
        episodes = dt.fread(file_episodes, na_strings=['\\N'], fill=True)

        # remove files
        os.remove(file_basics)
        os.remove(file_ratings)
        os.remove(file_episodes)

        # Create Title with Ratings dataset
        # join titles with non-null ratings
        ratings = ratings[~dt.isna(dt.f.averageRating), :]
        ratings.key = "tconst"
        basics_ratings = basics[:, :, dt.join(ratings)]

        # Create Episodes dataset
        episodes = episodes[~dt.isna(dt.f.seasonNumber)
                            & ~dt.isna(dt.f.episodeNumber), :]
        episode_ratings = episodes[:, :, dt.join(ratings)]
        episode_ratings.names = {
            'tconst': 'episodeTconst',
            'parentTconst': 'tconst',
            'averageRating': 'episodeAverageRating',
            'numVotes': 'episodeNumVotes'
        }
        basics_ratings.key = 'tconst'
        title_episode_ratings = episode_ratings[:, :, dt.join(basics_ratings)]

        # enumerate series episodes from 1 to N
        title_episode_ratings = title_episode_ratings[
            :, :, dt.sort(dt.f.tconst, dt.f.seasonNumber, dt.f.episodeNumber)]
        result = title_episode_ratings[
            :, dt.count(), dt.by(dt.f.tconst)][:, 'count'].to_list()
        from itertools import chain
        cumcount = chain.from_iterable([i + 1 for i in range(n)]
                                       for n in result[0])
        title_episode_ratings['episodeSequence'] = dt.Frame(tuple(cumcount))

        # return datasets
        return {
            f"imdb_title_ratings": basics_ratings,
            f"imdb_episode_ratings": title_episode_ratings
        }
# Output:
#   dataset with downsampled majority class

from sklearn.utils import resample
import pandas as pd
from datatable import f, count, sort, by, min, max, rbind

# parameters
target_col = "Known_Fraud"
times = 5
random_seed = 123

new_dataset_name = "new_dataset_name_with_downsampled_majority"

# counts by target groups
g = X[:, {"count": count()}, by(target_col)]
if g.shape[0] != 2:
    raise ValueError(
        "Not a binary target - target column must contain exactly 2 values.")

# find sizes and target values for minority and majority class partitions
n_minority = g[:, min(f.count)][0, 0]
n_majority = g[:, max(f.count)][0, 0]
target_minority = g[f.count == n_minority, target_col][0, 0]
target_majority = g[f.count == n_majority, target_col][0, 0]

# validate that times indeed downsamples majority class
if times * n_minority >= n_majority:
    raise ValueError(
        "Downsampling coefficient `times` is too large: downsampled dataset results in inflated majority class."
    )
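
The recipe is truncated at this point; below is only a hedged sketch of how the announced downsampling step might be completed, reusing the names defined above. The round-trip through pandas and sklearn's resample is an assumption, not the original author's code.

import datatable as dt  # assumed available alongside the explicit imports above

# keep every minority-class row; sample times * n_minority rows from the majority class
df_majority = X[f[target_col] == target_majority, :].to_pandas()
df_majority_down = resample(df_majority,
                            replace=False,
                            n_samples=times * n_minority,
                            random_state=random_seed)
new_dataset = rbind(X[f[target_col] == target_minority, :],
                    dt.Frame(df_majority_down))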
Example #12
# ~ 3b ~
# Investigate Unknown HPO Codes
# For each code listed in 'hpo_codes_not_found', investigate the code to
# determine how to reconcile the invalid code. In some instances, you can
# add a new record in `rd3_phenotype`, but some codes may have changed or
# are no longer used. If you are still unable to find an answer, contact
# the data provider.
#

# prep "HPO codes to verify" dataset
hpoCodes = dt.Frame(hpo_codes_not_found, types={
    'id': str,
    'hpo': str
})[:, {
    'frequency': count(),
    'url': None,
    'status': 'not.found',
    'action': None
},
   by(f.hpo)]

# set url for HPO code if it exists
hpoCodes['url'] = dt.Frame([
    f'http://purl.obolibrary.org/obo/{d}' for d in hpoCodes['hpo'].to_list()[0]
])

# manually check each link and search for code. Either add the code or follow
# up with SolveRD project data coordinators.
rbind(fread('data/unknown_hpo_codes.csv'),
      hpoCodes).to_csv('data/unknown_hpo_codes.csv')
Example #13
                    'TMIN': "temp_min",
                    'TMAX': "temp_max"}

weather_dt

weather_dt.key="stop_date"

# count the number of missing values
policia_dt.countna()

del policia_dt[:,['county_name', 'state']]

# glance
policia_dt

policia_dt[:,count(),by(f.driver_gender)]

policia_tidy_dt = policia_dt[~dt.isna(f.driver_gender),:]

policia_tidy_dt[:,count(),by(f.violation)
               ][:,f[:].extend({'grand_tot':dt.sum(f.count)})
                ][:,f[:].extend({'prop':f.count/f.grand_tot})
                 ][:,f[:].remove(f.grand_tot),sort(-f.prop)
                  ]

# custom function to generate a summary report per a single group column
def py_dt_one_group_proportions_summary(DT,por):
    
    DT_summary = DT[:,dt.count(),by(f[por])
                   ][:,f[:].extend({'grand_tot':dt.sum(f.count)})
                    ][:,f[:].extend({'prop':f.count/f.grand_tot})
                     ][:,f[:].remove(f.grand_tot),sort(-f.prop)
                      ]

    return DT_summary
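
A usage sketch (illustrative only, reusing the police-stops frame from above):

py_dt_one_group_proportions_summary(policia_tidy_dt, 'driver_gender')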
weights = defaultdict(list)
gains = defaultdict(list)
for experimento in os.listdir(f'../experimentos/'):
    for file in os.listdir(f'../experimentos/{experimento}/'):
        if file.endswith('importance.csv'):
            with open(f'../experimentos/{experimento}/{file}') as csvfile:
                reader = csv.DictReader(csvfile)
                for row in reader:
                    weights[row['variable']].append(float(row['weight']))
                    gains[row['variable']].append(float(row['gain']))

weights = pad_dict_list(weights)
gains = pad_dict_list(gains)

df_weights = dt.Frame(weights).mean().to_pandas().T
df_gains = dt.Frame(gains).mean().to_pandas().T
df_count = dt.Frame(weights)[:, dt.count(dt.f[:])].to_pandas().T

df = pd.DataFrame(columns=['variable', 'weight', 'gain'])
df['variable'] = df_weights.index.values
df['gain'] = df_gains[0].values
df['count'] = df_count[0].values
df['weight'] = df_weights[0].values

df = df.sort_values(by=['gain', 'count', 'weight'], ascending=False)

df.to_csv('../analisis_feature_importance.csv', index=False)




# step - 1 : flag the rows holding the max body_mass of penguins per sex
penguins_dt[:, update(temp=f.body_mass_g == dt.max(f.body_mass_g)), by(f.sex)]

# step - 2 : keep the flagged rows and drop the helper column
penguins_dt[f.temp == 1, f[:].remove(f.temp)]

# step - 1 : flag the rows holding the min body_mass of penguins per sex
penguins_dt[:, update(temp=f.body_mass_g == dt.min(f.body_mass_g)), by(f.sex)]

# step - 2 : keep the flagged rows and drop the helper column
penguins_dt[f.temp == 1, f[:].remove(f.temp)]

del penguins_dt["temp"]

penguins_tidy_dt = penguins_dt[~dt.isna(f.sex), :]

penguins_year_island = penguins_tidy_dt[:, {
    'total': count()
}, by(f.year, f.island)]

penguins_year = penguins_year_island[:, {
    'gr_total': dt.sum(f.total)
}, by(f.year)]

penguins_year

penguins_year.key = "year"

penguins_year_island = penguins_year_island[:, :, dt.join(penguins_year)]

penguins_year_island[:, update(perc=f.total / f.gr_total)]

penguins_year_island
    def score(self,
              actual: np.array,
              predicted: np.array,
              sample_weight: typing.Optional[np.array] = None,
              labels: typing.Optional[List[any]] = None,
              X: typing.Optional[dt.Frame] = None,
              **kwargs) -> float:

        # Get the logger if it exists
        logger = self.get_experiment_logger()

        # hard-coded as access to experiment parameters (such as self.tgc) not yet available
        tgc = ["Store", "Dept"]
        # tgc = ["state"]
        # tgc = None

        # enable weighted average over TS R2 scores: weighted based on TS share of rows
        isR2AverageWeighted = False

        # obtain a scorer for metric to use
        scorer = self.get_scorer()

        if tgc is None or not all(col in X.names for col in tgc):
            loggerinfo(
                logger,
                f"TS R2 computes single R2 on {X.nrows} rows as either tgc {tgc} is not defined or incorrect."
            )
            return scorer.score(actual, predicted, sample_weight, labels,
                                **kwargs)
        else:
            tgc_values = X[:, {
                "weight": count() / X.nrows,
                "r2": 0.0
            }, by(tgc)]
            loggerinfo(
                logger,
                f"TS R2 computes multiple R2 on {X.nrows} rows, tgc {tgc} with weighting is {isR2AverageWeighted}."
            )
            none_values = [None] * X.nrows
            X = cbind(
                X[:, tgc],
                Frame(actual=actual,
                      predicted=predicted,
                      sample_weight=sample_weight
                      if sample_weight is not None else none_values))

            for i in range(0, tgc_values.nrows):
                current_tgc = tgc_values[i, :]
                current_tgc.key = tgc
                ts_frame = X[:, :, join(current_tgc)][~isna(f.r2), :]
                r2_score = scorer.score(
                    ts_frame['actual'].to_numpy(),
                    ts_frame['predicted'].to_numpy(),
                    ts_frame['sample_weight'].to_numpy()
                    if sample_weight is not None else None, labels, **kwargs)
                tgc_values[i, f.r2] = r2_score

                loggerinfo(
                    logger,
                    f"TS R2 = {r2_score} on {ts_frame.nrows} rows, tgc = {current_tgc[0, tgc].to_tuples()}"
                )

            if isR2AverageWeighted:
                # return np.average(tgc_values["r2"].to_numpy(), weights=tgc_values["weight"].to_numpy())
                return tgc_values[:, mean(f.r2 * f.weight)][0, 0]
            else:
                return tgc_values[:, mean(f.r2)][0, 0]
weather_dt = dt.fread(
    'https://assets.datacamp.com/production/repositories/1497/datasets/02f3fb2d4416d3f6626e1117688e0386784e8e55/weather.csv',
    na_strings=[""])

policia_dt

weather_dt

# count the number of missing values
policia_dt.countna()

del policia_dt[:, ['county_name', 'state']]

policia_dt

policia_dt[:, count(), by(f.driver_gender)]

policia_tidy_dt = policia_dt[~dt.isna(f.driver_gender), :]

policia_tidy_df = policia_tidy_dt.to_pandas()

policia_tidy_df.info()

policia_tidy_df

policia_tidy_dt[:, count(),
                by(f.violation)][:, f[:].extend({'grand_tot': dt.sum(
                    f.count)})][:, f[:].extend({'prop': f.count / f.grand_tot}
                                               )][:, f[:].remove(f.grand_tot),
                                                  sort(-f.prop)]
Example #18
def test_group_empty_frame3():
    DT = dt.Frame(A=[], stype=dt.float32)
    D2 = DT[:, count(f.A), by(f.A)]
    frame_integrity_check(D2)
    assert D2.shape == (0, 2)
    assert D2.stypes == (DT.stype, dt.int64)
Example #19
def test_count_with_i():
    # See issue 1316
    DT = dt.Frame(A=range(100))
    assert DT[:5, count()][0, 0] == 5
    assert DT[-12:, count()][0, 0] == 12
    assert DT[::3, count()][0, 0] == 34
Example #20
def test_group_boolean3():
    DT = dt.Frame(A=[True] * 1234)
    DTR = DT[:, count(), by(f.A)]
    assert_equals(DTR,
                  dt.Frame(A=[True], count=[1234], stypes={"count": dt.int64}))
Example #21
def test_count_2d_array_integer():
    a_in = [[9, 8, 2, 3, None, None, 3, 0, 5, 5, 8, None, 1],
            [0, 1, 0, 5, 3, 8, 1, 0, 2, 5, None, 8, 1]]
    a_reduce = count(a_in)
    assert a_reduce == 2
from datatable import f, by, count, update, sort, join
import re

dt.options.display.head_nrows = 4
dt.options.display.tail_nrows = 4
dt.init_styles()

# Importing data
amigos_info_dt = dt.fread('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-08/friends_info.csv')
amigos_dt = dt.fread('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-08/friends.csv')

# Glance
amigos_info_dt

# Seasons
amigos_info_dt[:,count(),by(f.season)]

# Unique episodes per a season
amigos_info_dt[:,count(),by(f.season,f.episode)
              ][:,{'unique_episodes':count()},by(f.season)
               ]

# average views and ratings per season
amigos_info_dt[:,dt.mean(f[-2:]),by(f.season)]

# Highest rating title
amigos_info_dt[f.imdb_rating==dt.max(f.imdb_rating),:]

# lowest rating title
amigos_info_dt[f.imdb_rating==dt.min(f.imdb_rating),:]
#ans = x[:, {"r2": cor(v1, v2)^2}, by(f.id2, f.id4)]
#print(ans.shape, flush=True)
#t = timeit.default_timer() - t_start
#m = memory_usage()
#t_start = timeit.default_timer()
#chk = ans[:, sum(f.r2)]
#chkt = timeit.default_timer() - t_start
#write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt)
#print(ans.head(3).to_pandas(), flush=True)
#print(ans.tail(3).to_pandas(), flush=True)
#del ans

question = "sum v3 count by id1:id6" # q10
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {"v3": sum(f.v3), "count": count()}, by(f.id1, f.id2, f.id3, f.id4, f.id5, f.id6)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, [sum(f.v3), sum(f.count)]]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt)
del ans
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {"v3": sum(f.v3), "count": count()}, by(f.id1, f.id2, f.id3, f.id4, f.id5, f.id6)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
Example #24
def test_groups_large1():
    n = 251 * 4000
    xs = [(i * 19) % 251 for i in range(n)]
    f0 = dt.Frame({"A": xs})
    f1 = f0[:, count(), by("A")]
    assert f1.to_list() == [list(range(251)), [4000] * 251]
    def create_data(
        X: dt.Frame = None
    ) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray,
               List[np.ndarray], pd.DataFrame, List[pd.DataFrame], Dict[
                   str, str],  # {data set names : paths}
               Dict[str, dt.Frame],  # {data set names : dt frames}
               Dict[str, np.ndarray],  # {data set names : np arrays}
               Dict[str, pd.DataFrame],  # {data set names : pd frames}
               ]:
        # define date column and forecast horizon
        date_col = 'date'
        group_by_cols = ["state"]
        forecast_len = 7

        # state codes lookup table
        us_state_codes = dt.Frame(
            code=[
                'AL', 'AK', 'AS', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC',
                'FL', 'GA', 'GU', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY',
                'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE',
                'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'MP', 'OH', 'OK',
                'OR', 'PA', 'PR', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT',
                'VI', 'VA', 'WA', 'WV', 'WI', 'WY'
            ],
            state=[
                'Alabama', 'Alaska', 'American Samoa', 'Arizona', 'Arkansas',
                'California', 'Colorado', 'Connecticut', 'Delaware',
                'District of Columbia', 'Florida', 'Georgia', 'Guam', 'Hawaii',
                'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky',
                'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan',
                'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska',
                'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico',
                'New York', 'North Carolina', 'North Dakota',
                'Northern Mariana Islands', 'Ohio', 'Oklahoma', 'Oregon',
                'Pennsylvania', 'Puerto Rico', 'Rhode Island',
                'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah',
                'Vermont', 'Virgin Islands', 'Virginia', 'Washington',
                'West Virginia', 'Wisconsin', 'Wyoming'
            ])
        us_state_codes.key = "state"

        # get states population lookup table
        us_states_pop = dt.fread(
            "http://www2.census.gov/programs-surveys/popest/datasets/2010-2019/national/totals/nst-est2019-alldata.csv"
        )
        us_states_pop.names = {'NAME': 'state', 'POPESTIMATE2019': 'pop'}
        us_states_pop = us_states_pop[dt.f.STATE > 0, :]
        us_states_pop.key = "state"

        # join state codes and population into single lookup table
        us_states_pop[:, dt.update(code=dt.g.code), dt.join(us_state_codes)]
        us_states_pop.key = "code"

        # US Covid Tracking API: https://covidtracking.com/data/api
        us_states = dt.fread(
            "https://covidtracking.com/api/v1/states/daily.csv")
        # remove deprecated fields
        deprecated = [
            'checkTimeEt', 'commercialScore', 'dateChecked', 'dateModified',
            'grade', 'hash', 'hospitalized', 'negativeIncrease',
            'negativeRegularScore', 'negativeScore', 'posNeg', 'positiveScore',
            'score', 'total'
        ]
        us_states = us_states[:, list(set(us_states.names) - set(deprecated))]
        us_states.names = {'state': 'code'}

        series_cols = [
            "positive", "negative", "hospitalizedCumulative",
            "inIcuCumulative", "onVentilatorCumulative", "recovered", "death"
        ]
        aggs = {f"{col}100k": f[col] / (g.pop / 100000) for col in series_cols}
        us_states[:,
                  dt.update(
                      state=g.state, pop=g.pop, pop100k=g.pop / 100000, **aggs),
                  join(us_states_pop)]
        us_states = us_states[~dt.isna(dt.f.state), :]

        # produce lag of 1 unit and add as new feature for each shift column
        series_cols.extend([col + "100k" for col in series_cols])
        aggs = {f"{col}_yesterday": shift(f[col]) for col in series_cols}
        us_states[:, update(**aggs), sort(date_col), by(group_by_cols)]

        # update NA lags
        aggs = {f"{col}_yesterday": 0 for col in series_cols}
        us_states[isna(f[f"{series_cols[0]}_yesterday"]), update(**aggs)]

        aggs = {
            f"{col}_daily": f[col] - f[f"{col}_yesterday"]
            for col in series_cols
        }
        us_states[:, update(**aggs), sort(date_col), by(group_by_cols)]

        for col in series_cols:
            del us_states[:, f[f"{col}_yesterday"]]

        # validate dataset
        if us_states[:, count(),
                     by(dt.f.state, f.date)][f.count > 1, :].shape[0] > 0:
            raise ValueError(
                "Found duplicate elements for the same date and state.")

        # determine threshold to split train and test based on forecast horizon
        dates = dt.unique(us_states[:, date_col])
        split_date = dates[-(forecast_len + 1):, :, dt.sort(date_col)][0, 0]
        test_date = dates[-1, :, dt.sort(date_col)][0, 0]

        # split data to honor forecast horizon in test set
        df = us_states[date_col].to_pandas()
        train = us_states[df[date_col] <= split_date, :]
        test = us_states[df[date_col] > split_date, :]

        return {
            f"covidtracking_daily_{split_date}_by_us_states_train": train,
            f"covidtracking_daily_{test_date}_by_us_states_test": test
        }
Example #26
def test_count_array_integer():
    a_in = [9, 8, 2, 3, None, None, 3, 0, 5, 5, 8, None, 1]
    a_reduce = count(a_in)
    assert a_reduce == 10
Example #27
    def create_data(
        X: dt.Frame = None
    ) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray,
               List[np.ndarray], pd.DataFrame, List[pd.DataFrame]]:
        import os
        from h2oaicore.systemutils_more import download
        from h2oaicore.systemutils import config
        import bz2

        def extract_bz2(file, output_file):
            zipfile = bz2.BZ2File(file)
            data = zipfile.read()
            open(output_file, 'wb').write(data)

        temp_path = os.path.join(config.data_directory,
                                 config.contrib_relative_directory, "airlines")
        os.makedirs(temp_path, exist_ok=True)

        # specify which years are used for training and testing
        training = [2007]
        testing = [2008]

        # download and unzip files
        files = []
        for f in ["%d.csv.bz2" % year for year in training + testing]:
            link = "http://stat-computing.org/dataexpo/2009/%s" % f
            file = download(link, dest_path=temp_path)
            output_file = file.replace(".bz2", "")
            extract_bz2(file, output_file)
            files.append(output_file)

        # parse with datatable
        X = dt.rbind(*[dt.fread(x) for x in files])

        # add date
        date_col = 'Date'
        X[:, date_col] = dt.f['Year'] * 10000 + dt.f['Month'] * 100 + dt.f[
            'DayofMonth']
        cols_to_keep = ['Date']

        # add number of flights in/out for each airport per given interval
        timeslice_mins = 60
        for name, new_col, col, group in [
            ("out", "CRSDepTime_mod", "CRSDepTime", "Origin"),
            ("in", "CRSArrTime_mod", "CRSArrTime", "Dest")
        ]:
            X[:, new_col] = X[:, dt.f[col] // timeslice_mins]
            group_cols = [date_col, group, new_col]
            new_name = 'flights_%s' % name
            flights = X[:, {new_name: dt.count()}, dt.by(*group_cols)]
            flights.key = group_cols
            cols_to_keep.append(new_name)
            X = X[:, :, dt.join(flights)]

        # Fill NaNs with 0s
        X[dt.isna(dt.f['DepDelay']), 'DepDelay'] = 0
        cols_to_keep.extend([
            'DepDelay',
            'Year',
            'Month',
            'DayofMonth',
            'DayOfWeek',
            'CRSDepTime',
            'UniqueCarrier',
            'FlightNum',
            'TailNum',
            'CRSElapsedTime',
            'Origin',
            'Dest',
            'Distance',
            # Leaks for delay
            # 'DepTime',
            # 'ArrTime', #'CRSArrTime',
            # 'ActualElapsedTime',
            # 'AirTime', #'ArrDelay', #'DepDelay',
            # 'TaxiIn', #'TaxiOut', #'Cancelled', #'CancellationCode', #'Diverted', #'CarrierDelay',
            # #'WeatherDelay', #'NASDelay', #'SecurityDelay', #'LateAircraftDelay',
        ])
        X = X[:, cols_to_keep]

        # Join in some extra info
        join_files = [('UniqueCarrier', 'carriers.csv', 'Code'),
                      ('Origin', 'airports.csv', 'iata'),
                      ('Dest', 'airports.csv', 'iata'),
                      ('TailNum', 'plane-data.csv', 'tailnum')]

        for join_key, file, col in join_files:
            file = download('http://stat-computing.org/dataexpo/2009/%s' %
                            file,
                            dest_path=temp_path)
            X_join = dt.fread(file, fill=True)
            X_join.names = {col: join_key}
            X_join.names = [join_key] + [
                join_key + "_" + x for x in X_join.names if x != join_key
            ]
            X_join.key = join_key
            X = X[:, :, dt.join(X_join)]
            del X[:, join_key]

        split = False

        if not split:
            filename = os.path.join(
                temp_path, "flight_delays_regression_%d-%d.jay" %
                (min(training), max(testing)))
            X.to_jay(filename)
            return filename
        else:
            # prepare splits (by year) and create binary .jay files for import into Driverless AI
            output_files = []
            for condition, name in [
                ((min(training) <= dt.f['Year']) &
                 (dt.f['Year'] <= max(training)), 'training'),
                ((min(testing) <= dt.f['Year']) &
                 (dt.f['Year'] <= max(testing)), 'test'),
            ]:
                X_split = X[condition, :]
                filename = os.path.join(temp_path,
                                        "flight_delays_%s.jay" % name)
                X_split.to_jay(filename)
                output_files.append(filename)
            return output_files
Example #28
def test_count_array_string():
    a_in = [None, "blue", "green", "indico", None, None, "orange", "red",
            "violet", "yellow", "green", None, "blue"]
    a_reduce = count(a_in)
    assert a_reduce == 9
Example #29
ans = x[:, {"r2": corr(f.v1, f.v2)**2}, by(f.id2, f.id4)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, sum(f.r2)]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt, on_disk=on_disk)
print(ans.head(3).to_pandas(), flush=True)
print(ans.tail(3).to_pandas(), flush=True)
del ans

question = "sum v3 count by id1:id6" # q10
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {"v3": sum(f.v3), "count": count()}, by(f.id1, f.id2, f.id3, f.id4, f.id5, f.id6)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, [sum(f.v3), sum(f.count)]]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt, on_disk=on_disk)
del ans
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {"v3": sum(f.v3), "count": count()}, by(f.id1, f.id2, f.id3, f.id4, f.id5, f.id6)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
Example #30
    def create_data(
        X: dt.Frame = None
    ) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray,
               List[np.ndarray], pd.DataFrame, List[pd.DataFrame]]:
        import os
        from h2oaicore.systemutils_more import download
        from h2oaicore.systemutils import config
        import bz2

        def extract_bz2(file, output_file):
            zipfile = bz2.BZ2File(file)
            data = zipfile.read()
            open(output_file, 'wb').write(data)

        temp_path = os.path.join(user_dir(), "recipe_tmp", "airlines")
        os.makedirs(temp_path, exist_ok=True)
        dt.options.nthreads = 8

        # specify which years are used for training and testing
        training = list(range(2005, 2008))
        testing = [2008]

        # download and unzip files
        files = []
        for f in ["%d.csv.bz2" % year for year in training + testing]:
            link = AirlinesData.base_url + "%s" % f
            file = download(link, dest_path=temp_path)
            output_file = file.replace(".bz2", "")
            if not os.path.exists(output_file):
                extract_bz2(file, output_file)
            files.append(output_file)

        # parse with datatable
        X = dt.rbind(*[dt.fread(x) for x in files])

        # add date
        date_col = 'Date'
        X[:, date_col] = dt.f['Year'] * 10000 + dt.f['Month'] * 100 + dt.f[
            'DayofMonth']
        cols_to_keep = ['Date']

        # add number of flights in/out for each airport per given interval
        timeslice_mins = 60
        for name, new_col, col, group in [
            ("out", "CRSDepTime_mod", "CRSDepTime", "Origin"),
            ("in", "CRSArrTime_mod", "CRSArrTime", "Dest")
        ]:
            X[:, new_col] = X[:, dt.f[col] // timeslice_mins]
            group_cols = [date_col, group, new_col]
            new_name = 'flights_%s_per_%d_min' % (name, timeslice_mins)
            flights = X[:, {new_name: dt.count()}, dt.by(*group_cols)]
            flights.key = group_cols
            cols_to_keep.append(new_name)
            X = X[:, :, dt.join(flights)]

        # select flights leaving from SFO only
        X = X[dt.f['Origin'] == 'SFO', :]

        # Fill NaNs in DepDelay column
        X[dt.isna(dt.f['DepDelay']), 'DepDelay'] = 0

        # create binary target column
        depdelay_threshold_mins = 15
        target = 'DepDelay%dm' % depdelay_threshold_mins
        X[:, target] = dt.f['DepDelay'] > depdelay_threshold_mins
        cols_to_keep.extend([
            target,
            'Year',
            'Month',
            'DayofMonth',
            'DayOfWeek',
            'CRSDepTime',
            'UniqueCarrier',
            'FlightNum',
            'TailNum',
            'CRSElapsedTime',
            'Origin',
            'Dest',
            'Distance',
            # Leaks for delay
            # 'DepTime',
            # 'ArrTime', #'CRSArrTime',
            # 'ActualElapsedTime',
            # 'AirTime', #'ArrDelay', #'DepDelay',
            # 'TaxiIn', #'TaxiOut', #'Cancelled', #'CancellationCode', #'Diverted', #'CarrierDelay',
            # #'WeatherDelay', #'NASDelay', #'SecurityDelay', #'LateAircraftDelay',
        ])
        X = X[:, cols_to_keep]

        # Join in some extra info
        join_files = [('UniqueCarrier', 'carriers.csv', 'Code'),
                      ('Origin', 'airports.csv', 'iata'),
                      ('Dest', 'airports.csv', 'iata'),
                      ('TailNum', 'plane-data.csv', 'tailnum')]

        for join_key, file, col in join_files:
            file = download(
                'https://0xdata-public.s3.amazonaws.com/data_recipes_data/%s' %
                file,
                dest_path=temp_path)
            X_join = dt.fread(file, fill=True)
            X_join.names = {col: join_key}
            X_join.names = [join_key] + [
                join_key + "_" + x for x in X_join.names if x != join_key
            ]
            X_join.key = join_key
            X = X[:, :, dt.join(X_join)]
            del X[:, join_key]

        split = True
        if not split:
            filename = os.path.join(
                temp_path, "flight_delays_data_recipe_%d-%d.csv" %
                (min(training), max(testing)))
            X.to_csv(filename)
            return filename
        else:
            # prepare splits (by year) and create binary .jay files for import into Driverless AI
            output_files = []
            for condition, name in [
                ((min(training) <= dt.f['Year']) &
                 (dt.f['Year'] <= max(training)), 'training'),
                ((min(testing) <= dt.f['Year']) &
                 (dt.f['Year'] <= max(testing)), 'test'),
            ]:
                X_split = X[condition, :]
                filename = os.path.join(
                    temp_path,
                    "augmented_flights_%s-%d_%s.csv" %
                    (X_split[:, 'Year'].min1(), X_split[:, 'Year'].max1(), name))
                X_split.to_csv(filename)
                output_files.append(filename)
            return output_files
Example #31
#t = timeit.default_timer() - t_start
#m = memory_usage()
#t_start = timeit.default_timer()
#chk = ans[:, sum(f.r2)]
#chkt = timeit.default_timer() - t_start
#write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt)
#print(ans.head(3).to_pandas(), flush=True)
#print(ans.tail(3).to_pandas(), flush=True)
#del ans

question = "sum v3 count by id1:id6"  # q10
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {
    "v3": sum(f.v3),
    "count": count()
},
        by(f.id1, f.id2, f.id3, f.id4, f.id5, f.id6)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, [sum(f.v3), sum(f.count)]]
chkt = timeit.default_timer() - t_start
write_log(task=task,
          data=data_name,
          in_rows=x.shape[0],
          question=question,
          out_rows=ans.shape[0],
          out_cols=ans.shape[1],
          solution=solution,