def __init__(self, data_dir, output_dir, **kwargs):
    """Load the available columns, split train/val and save the val split.

    Parameters
    ----------
    data_dir : str
        Directory searched for one ``<column>.npy`` file per candidate
        column ('x', 'x_len', 'c', 'c_len', 'text').  Columns whose file
        does not exist are skipped.
    output_dir : str or path-like
        Directory in which ``self.val_df.dict`` is saved with ``np.save``
        under a ``YYYY-MM-DD_HH-MM`` timestamp name.
    **kwargs
        Forwarded unchanged to the ``DataFrame`` constructor.
    """
    candidate_cols = ['x', 'x_len', 'c', 'c_len', 'text']

    # Keep only the columns whose .npy file is actually present.
    data_cols = [
        col for col in candidate_cols
        if os.path.exists(os.path.join(data_dir, '{}.npy'.format(col)))
    ]
    # Load every file exactly once.
    data = [
        np.load(os.path.join(data_dir, '{}.npy'.format(col)))
        for col in data_cols
    ]

    # test_df keeps the full frame; train/val are carved out of it.
    self.test_df = DataFrame(columns=data_cols, data=data, **kwargs)
    self.train_df, self.val_df = self.test_df.train_test_split(
        train_size=0.95, random_state=2018)

    # Save the validation split so it can be inspected later.
    date_str = datetime.now().strftime('%Y-%m-%d_%H-%M')
    np.save(Path(output_dir) / date_str, self.val_df.dict)

    # Load the strokes
    # np.load("../checkpoints/original/2020-03-13_15-03.npy", allow_pickle=True).item()["x"]

    print('train size', len(self.train_df))
    print('val size', len(self.val_df))
    print('test size', len(self.test_df))
def __init__(self, data_dir):
    """Load the order/product feature arrays and split them by label.

    Parameters
    ----------
    data_dir : str
        Directory holding ``order_id.npy``, ``product_id.npy``,
        ``features.npy``, ``label.npy`` and the ``feature_means`` /
        ``feature_maxs`` / ``feature_mins`` ``.npy`` files.
    """
    data_cols = ['order_id', 'product_id', 'features', 'label']

    # Column arrays are memory-mapped read-only rather than read eagerly.
    data = []
    for col in data_cols:
        col_path = os.path.join(data_dir, '{}.npy'.format(col))
        data.append(np.load(col_path, mmap_mode='r'))

    df = DataFrame(columns=data_cols, data=data)
    self.data_dim = df['features'].shape[1]
    print(df.shapes())
    print('loaded data')

    # Rows labelled -1 go to the test frame; everything else is split
    # 90/10 into train and validation.
    self.test_df = df.mask(df['label'] == -1)
    labelled_df = df.mask(df['label'] != -1)
    self.train_df, self.val_df = labelled_df.train_test_split(
        train_size=0.9)

    print('train size', len(self.train_df))
    print('val size', len(self.val_df))
    print('test size', len(self.test_df))

    means_path = os.path.join(data_dir, 'feature_means.npy')
    maxs_path = os.path.join(data_dir, 'feature_maxs.npy')
    mins_path = os.path.join(data_dir, 'feature_mins.npy')
    self.feature_means = np.load(means_path)
    self.feature_maxs = np.load(maxs_path)
    self.feature_mins = np.load(mins_path)
def __init__(self, data_dir):
    """Load the per-user order-history columns and split train/val.

    Parameters
    ----------
    data_dir : str
        Directory holding one ``<column>.npy`` file per entry of
        ``data_cols``.
    """
    data_cols = [
        'user_id',
        'history_length',
        'order_size_history',
        'reorder_size_history',
        'order_number_history',
        'order_dow_history',
        'order_hour_history',
        'days_since_prior_order_history',
    ]
    # Memory-map each column read-only instead of reading it eagerly.
    data = [
        np.load(os.path.join(data_dir, '{}.npy'.format(col)), mmap_mode='r')
        for col in data_cols
    ]

    # test_df keeps the full frame; train/val are carved out of it.
    self.test_df = DataFrame(columns=data_cols, data=data)

    # print() calls (not Python 2 print statements) so the block parses
    # under Python 3.
    print(self.test_df.shapes())
    print('loaded data')

    self.train_df, self.val_df = self.test_df.train_test_split(
        train_size=0.9)

    print('train size', len(self.train_df))
    print('val size', len(self.val_df))
    print('test size', len(self.test_df))
def __init__(self, data_dir):
    """Load the user/product history columns and split train/val.

    Parameters
    ----------
    data_dir : str
        Directory holding one ``<column>.npy`` file per entry of
        ``data_cols``.
    """
    data_cols = [
        'user_id',
        'product_id',
        'aisle_id',
        'department_id',
        'is_ordered_history',
        'index_in_order_history',
        'order_dow_history',
        'order_hour_history',
        'days_since_prior_order_history',
        'order_size_history',
        'reorder_size_history',
        'order_is_weekend_history',
        'order_part_of_day_history',
        'order_number_history',
        'history_length',
        'product_name',
        'product_name_length',
        'eval_set',
        'label'
    ]

    # Memory-map each column read-only instead of reading it eagerly.
    data = []
    for col in data_cols:
        col_path = os.path.join(data_dir, '{}.npy'.format(col))
        data.append(np.load(col_path, mmap_mode='r'))

    full_df = DataFrame(columns=data_cols, data=data)
    self.test_df = full_df  # the unsplit frame doubles as the test set

    print(full_df.shapes())
    print('loaded data')

    self.train_df, self.val_df = full_df.train_test_split(train_size=0.9)

    print('train size', len(self.train_df))
    print('val size', len(self.val_df))
    print('test size', len(self.test_df))
def __init__(self, data_dir):
    """Load the order/product feature arrays and split them by label.

    Parameters
    ----------
    data_dir : str
        Directory holding ``order_id.npy``, ``product_id.npy``,
        ``features.npy``, ``label.npy`` and the ``feature_means`` /
        ``feature_maxs`` / ``feature_mins`` ``.npy`` files.
    """
    data_cols = ['order_id', 'product_id', 'features', 'label']
    # Memory-map each column read-only instead of reading it eagerly.
    data = [
        np.load(os.path.join(data_dir, '{}.npy'.format(col)), mmap_mode='r')
        for col in data_cols
    ]

    df = DataFrame(columns=data_cols, data=data)
    self.data_dim = df['features'].shape[1]

    # print() calls (not Python 2 print statements) so the block parses
    # under Python 3.
    print(df.shapes())
    print('loaded data')

    # Rows labelled -1 form the test frame; the rest is split 90/10.
    self.test_df = df.mask(df['label'] == -1)
    self.train_df = df.mask(df['label'] != -1)
    self.train_df, self.val_df = self.train_df.train_test_split(
        train_size=0.9)

    print('train size', len(self.train_df))
    print('val size', len(self.val_df))
    print('test size', len(self.test_df))

    self.feature_means = np.load(
        os.path.join(data_dir, 'feature_means.npy'))
    self.feature_maxs = np.load(os.path.join(data_dir, 'feature_maxs.npy'))
    self.feature_mins = np.load(os.path.join(data_dir, 'feature_mins.npy'))
def __init__(self, data_dir):
    """Load the store time-series columns, split train/val, size the ids.

    Parameters
    ----------
    data_dir : str
        Directory holding one ``<column>.npy`` file per entry of
        ``data_cols``.
    """
    data_cols = [
        'x_raw',
        'holidayinfo',
        'air_store_id2',
        'id',
        'x',
        'air_area_name',
        'air_genre_name',
        'latitude',
        'longitude',
        'start_date',
        'x_lags',
        'xy_lags',
    ]

    # Memory-map each column read-only instead of reading it eagerly.
    data = []
    for col in data_cols:
        col_path = os.path.join(data_dir, '{}.npy'.format(col))
        data.append(np.load(col_path, mmap_mode='r'))

    full_df = DataFrame(columns=data_cols, data=data)
    self.test_df = full_df  # the unsplit frame doubles as the test set
    self.train_df, self.val_df = full_df.train_test_split(train_size=0.9)

    # Each count is the column's maximum value plus one.
    self.num_area_name = full_df['air_area_name'].max() + 1
    self.num_genre_name = full_df['air_genre_name'].max() + 1
    self.num_store_id = full_df['air_store_id2'].max() + 1

    print('train size', len(self.train_df))
    print('val size', len(self.val_df))
    print('test size', len(self.test_df))
def __init__(self, data_dir, seed):
    """Load ``all_df``/``all_id`` and make a seeded 90/10 train/val split.

    Parameters
    ----------
    data_dir : str
        Directory holding ``all_df.npy`` and ``all_id.npy``.
    seed
        Passed as ``random_state`` to ``train_test_split``.
    """
    data_cols = ['all_df', 'all_id']

    train_data = []
    for col in data_cols:
        col_path = os.path.join(data_dir, '{}.npy'.format(col))
        train_data.append(np.load(col_path))

    self.full_train = DataFrame(columns=data_cols, data=train_data)
    # The test frame is the very same object as the full training frame.
    self.test_df = self.full_train

    # seed=np.random.randint(0, 1000000)
    # seed = 99 + seed
    split = self.full_train.train_test_split(
        train_size=0.9, random_state=seed)
    self.train_df, self.val_df = split

    print('train size', len(self.train_df))
    print('val size', len(self.val_df))
    print('test size', len(self.test_df))

    self.max_frames = 72
    self.GLOBAL_IS_VAL = False
    self.GLOBAL_IS_TEST = False
def __init__(self, data):
    """Wrap ``data`` as an ('x', 'y') frame and derive product statistics.

    Parameters
    ----------
    data
        Column data handed to ``DataFrame`` for the columns 'x' and 'y'.
    """
    df = DataFrame(columns=['x', 'y'], data=data)

    # 90/10 split, seeded from the shared config.
    split = df.train_test_split(
        train_size=0.9, random_state=config.random_seed)
    self.train_df, self.val_df = split

    # Both statistics come from the full (unsplit) 'y' column.
    self.num_products = df['y'].max() + 1
    y_counts = np.bincount(df['y'])
    self.product_dist = y_counts.tolist()
def __init__(self, data):
    """Wrap ``data`` as an ('i', 'j', 'V_ij') frame and split train/val.

    Parameters
    ----------
    data
        Column data handed to ``DataFrame`` for 'i', 'j' and 'V_ij'.
    """
    df = DataFrame(columns=['i', 'j', 'V_ij'], data=data)

    # 90/10 split, seeded from the shared config.
    split = df.train_test_split(
        train_size=0.9, random_state=config.random_seed)
    self.train_df, self.val_df = split
    self.test_df = df  # the unsplit frame doubles as the test set

    # Each count is the column's maximum value plus one.
    self.num_users = df['i'].max() + 1
    self.num_products = df['j'].max() + 1
def __init__(self, data_dir):
    """Load 'i', 'j' and 'V_ij' from ``data_dir`` and split train/val.

    Parameters
    ----------
    data_dir : str
        Directory holding ``i.npy``, ``j.npy`` and ``V_ij.npy``.
    """
    data_cols = ['i', 'j', 'V_ij']

    # Memory-map each column read-only instead of reading it eagerly.
    data = []
    for col in data_cols:
        col_path = os.path.join(data_dir, '{}.npy'.format(col))
        data.append(np.load(col_path, mmap_mode='r'))

    df = DataFrame(columns=data_cols, data=data)
    self.train_df, self.val_df = df.train_test_split(train_size=0.9)

    print('train size', len(self.train_df))
    print('val size', len(self.val_df))

    # Each count is the column's maximum value plus one.
    self.num_users = df['i'].max() + 1
    self.num_products = df['j'].max() + 1
def __init__(self, data_dir):
    """Load 'x' and 'y' from ``data_dir`` and derive product statistics.

    Parameters
    ----------
    data_dir : str
        Directory holding ``x.npy`` and ``y.npy``.
    """
    data_cols = ['x', 'y']
    # Memory-map each column read-only instead of reading it eagerly.
    data = [
        np.load(os.path.join(data_dir, '{}.npy'.format(col)), mmap_mode='r')
        for col in data_cols
    ]

    df = DataFrame(columns=data_cols, data=data)
    self.train_df, self.val_df = df.train_test_split(train_size=0.9)

    # print() calls (not Python 2 print statements) so the block parses
    # under Python 3.
    print('train size', len(self.train_df))
    print('val size', len(self.val_df))

    # num_products uses the full 'x' column; the distribution is counted
    # on the training split only.
    self.num_products = df['x'].max() + 1
    self.product_dist = np.bincount(self.train_df['x']).tolist()
def __init__(self, data_dir):
    """Load the page time-series columns and split train/val.

    Parameters
    ----------
    data_dir : str
        Directory holding one ``<column>.npy`` file per entry of
        ``data_cols``.
    """
    data_cols = ['data', 'is_nan', 'page_id', 'project', 'access', 'agent']
    data = [
        np.load(os.path.join(data_dir, '{}.npy'.format(col)))
        for col in data_cols
    ]

    # test_df keeps the full frame; train/val are carved out of it.
    self.test_df = DataFrame(columns=data_cols, data=data)
    self.train_df, self.val_df = self.test_df.train_test_split(
        train_size=0.9)

    # print() calls (not Python 2 print statements) so the block parses
    # under Python 3.
    print('train size', len(self.train_df))
    print('val size', len(self.val_df))
    print('test size', len(self.test_df))
def __init__(self, data_dir):
    """Load 'x', 'x_len', 'c', 'c_len' and make a seeded 95/5 split.

    Parameters
    ----------
    data_dir : str
        Directory holding ``x.npy``, ``x_len.npy``, ``c.npy`` and
        ``c_len.npy``.
    """
    data_cols = ['x', 'x_len', 'c', 'c_len']

    data = []
    for col in data_cols:
        col_path = os.path.join(data_dir, '{}.npy'.format(col))
        data.append(np.load(col_path))

    full_df = DataFrame(columns=data_cols, data=data)
    self.test_df = full_df  # the unsplit frame doubles as the test set

    # Fixed random_state keeps the split reproducible between runs.
    split = full_df.train_test_split(train_size=0.95, random_state=2018)
    self.train_df, self.val_df = split

    print('train size', len(self.train_df))
    print('val size', len(self.val_df))
    print('test size', len(self.test_df))
def __init__(self):
    """Build the "Registro" window with its two stacked frames.

    Both frames are gridded into the same cell (row 0, column 0) and are
    registered in ``self.frames`` under the keys 'register' and 'data'.
    """
    super().__init__()

    toplevel = self.winfo_toplevel()
    toplevel.title("Registro")
    self.resizable(0, 0)

    cell = dict(row=0, column=0, sticky='NSEW')

    # The data frame's callback switches to the register frame...
    self.dataFrame = DataFrame(self, lambda: self.showFrame('register'))
    self.dataFrame.grid(**cell)

    # ...and the main frame's callback switches to the data frame.
    # It is gridded last, into the same cell as the data frame.
    self.mainFrame = MainFrame(self, lambda: self.showFrame('data'))
    self.mainFrame.grid(**cell)

    self.frames = {
        'register': self.mainFrame,
        'data': self.dataFrame,
    }
def query_measurements_original(self, field_query, begin_time, end_time):
    """
    Query for epidata measurements.

    Parameters
    ----------
    field_query : dictionary containing either strings or lists of strings
        A dictionary containing field names and the values those fields must
        contain in matching measurements. Some system configurations require
        that values of specific fields be specified. A string field value
        represents an equality match, while a list value represents set
        membership (all values within the set are matched).
    begin_time : datetime
        Beginning of the time interval to query, inclusive.
    end_time : datetime
        End of the time interval to query, exclusive.

    Returns
    -------
    result : epidata DataFrame
        A DataFrame containing measurements matching the query.
    """
    self._check_cluster_memory()

    # Convert the Python arguments to their Java-side equivalents.
    j_query, j_begin, j_end = self._to_java_params(
        field_query, begin_time, end_time)

    jdf = self._jec.query(j_query, j_begin, j_end)
    return DataFrame(jdf=jdf, sql_ctx=self._sql_ctx)
def __init__(self, data_dir):
    """Load the user/aisle history columns and split train/val.

    Parameters
    ----------
    data_dir : str
        Directory holding one ``<column>.npy`` file per entry of
        ``data_cols``.
    """
    data_cols = [
        'user_id',
        'aisle_id',
        'department_id',
        'eval_set',
        'is_ordered_history',
        'index_in_order_history',
        'order_dow_history',
        'order_hour_history',
        'days_since_prior_order_history',
        'order_size_history',
        'order_number_history',
        'num_products_from_aisle_history',
        'history_length',
    ]
    # Memory-map each column read-only instead of reading it eagerly.
    data = [
        np.load(os.path.join(data_dir, '{}.npy'.format(col)), mmap_mode='r')
        for col in data_cols
    ]

    # test_df keeps the full frame; train/val are carved out of it.
    self.test_df = DataFrame(columns=data_cols, data=data)

    # print() calls (not Python 2 print statements) so the block parses
    # under Python 3.
    print(self.test_df.shapes())
    print('loaded data')

    self.train_df, self.val_df = self.test_df.train_test_split(
        train_size=0.9)

    print('train size', len(self.train_df))
    print('val size', len(self.val_df))
    print('test size', len(self.test_df))
class DataReader(object):
    """Loads 'x', 'x_len', 'c', 'c_len' and serves trimmed batches."""

    def __init__(self, data_dir):
        """Load the four columns from ``data_dir`` and split train/val.

        ``test_df`` is the full frame; ``train_df``/``val_df`` are a seeded
        95/5 split of it.
        """
        data_cols = ['x', 'x_len', 'c', 'c_len']

        data = []
        for col in data_cols:
            col_path = os.path.join(data_dir, '{}.npy'.format(col))
            data.append(np.load(col_path))

        self.test_df = DataFrame(columns=data_cols, data=data)
        split = self.test_df.train_test_split(
            train_size=0.95, random_state=2018)
        self.train_df, self.val_df = split

        print('train size', len(self.train_df))
        print('val size', len(self.val_df))
        print('test size', len(self.test_df))

    def train_batch_generator(self, batch_size):
        """Shuffled batches over the training split (10000 epochs)."""
        return self.batch_generator(
            batch_size=batch_size,
            df=self.train_df,
            shuffle=True,
            num_epochs=10000,
            mode='train',
        )

    def val_batch_generator(self, batch_size):
        """Shuffled batches over the validation split (10000 epochs)."""
        return self.batch_generator(
            batch_size=batch_size,
            df=self.val_df,
            shuffle=True,
            num_epochs=10000,
            mode='val',
        )

    def test_batch_generator(self, batch_size):
        """One unshuffled pass over the full frame."""
        return self.batch_generator(
            batch_size=batch_size,
            df=self.test_df,
            shuffle=False,
            num_epochs=1,
            mode='test',
        )

    def batch_generator(self, batch_size, df, shuffle=True,
                        num_epochs=10000, mode='train'):
        """Yield batches from ``df`` with a shifted target ``y`` added.

        For every batch, 'x_len' is decremented by one, 'y' is 'x' shifted
        one step along axis 1, and 'x', 'y' and 'c' are cut to the longest
        length found in the batch.  A smaller final batch is only allowed
        when ``mode == 'test'``.
        """
        allow_partial = (mode == 'test')
        batches = df.batch_generator(
            batch_size=batch_size,
            shuffle=shuffle,
            num_epochs=num_epochs,
            allow_smaller_final_batch=allow_partial,
        )
        for batch in batches:
            # One step is given up so that y can be x shifted by one.
            batch['x_len'] = batch['x_len'] - 1
            longest_x = np.max(batch['x_len'])
            longest_c = np.max(batch['c_len'])

            untrimmed_x = batch['x']
            batch['y'] = untrimmed_x[:, 1:longest_x + 1, :]
            batch['x'] = untrimmed_x[:, :longest_x, :]
            batch['c'] = batch['c'][:, :longest_c]
            yield batch
def __init__(self, data_dir):
    """Load 'x' and 'y' from ``data_dir`` and derive product statistics.

    Parameters
    ----------
    data_dir : str
        Directory holding ``x.npy`` and ``y.npy``.
    """
    data_cols = ['x', 'y']
    # Memory-map each column read-only instead of reading it eagerly.
    data = [
        np.load(os.path.join(data_dir, '{}.npy'.format(col)), mmap_mode='r')
        for col in data_cols
    ]

    df = DataFrame(columns=data_cols, data=data)
    self.train_df, self.val_df = df.train_test_split(train_size=0.9)

    # print() calls (not Python 2 print statements) so the block parses
    # under Python 3.
    print('train size', len(self.train_df))
    print('val size', len(self.val_df))

    # num_products uses the full 'x' column; the distribution is counted
    # on the training split only.
    self.num_products = df['x'].max() + 1
    self.product_dist = np.bincount(self.train_df['x']).tolist()
def read_dataframe_from_csv(path):
    """Read a CSV file into a ``DataFrame``.

    The first row of the file supplies the headers; the remaining rows are
    the data.

    Parameters
    ----------
    path : str
        Path of the CSV file to read.

    Returns
    -------
    DataFrame
        Built as ``DataFrame(headers, rows)``.

    Raises
    ------
    StopIteration
        If the file has no header row (propagated from ``next``).
    """
    # The context manager closes the file descriptor even on error.
    with open(path) as dataset_handle:
        reader = csv.reader(dataset_handle)
        headers = next(reader)
        # Read all rows while the file is still open: the csv reader is
        # lazy and would fail once the handle is closed.
        # NOTE(review): DataFrame now receives a list instead of the reader
        # object; assumes it accepts any iterable of rows -- confirm.
        rows = list(reader)
    return DataFrame(headers, rows)
def __init__(self, data_dir):
    """Load 'x', 'x_len', 'c', 'c_len' and make a seeded 95/5 split.

    Parameters
    ----------
    data_dir : str
        Directory holding ``x.npy``, ``x_len.npy``, ``c.npy`` and
        ``c_len.npy``.
    """
    data_cols = ['x', 'x_len', 'c', 'c_len']

    col_paths = (
        os.path.join(data_dir, '{}.npy'.format(col)) for col in data_cols
    )
    data = [np.load(col_path) for col_path in col_paths]

    # test_df keeps the full frame; train/val are carved out of it with a
    # fixed random_state so the split is reproducible.
    self.test_df = DataFrame(columns=data_cols, data=data)
    self.train_df, self.val_df = self.test_df.train_test_split(
        train_size=0.95,
        random_state=2018,
    )

    print('train size', len(self.train_df))
    print('val size', len(self.val_df))
    print('test size', len(self.test_df))
def __init__(self, data_dir):
    """Load the page time-series columns and split train/val.

    Parameters
    ----------
    data_dir : str
        Directory holding one ``<column>.npy`` file per entry of
        ``data_cols``.
    """
    data_cols = [
        'data',
        'isNAN',
        'page_id',
        'project',
        'access',
        'agent',
        'test_data',
        'test_isNAN'
    ]
    data = [
        np.load(os.path.join(data_dir, '{}.npy'.format(col)))
        for col in data_cols
    ]

    # testDataframe keeps the full frame; train/val are a 95/5 split of it.
    self.testDataframe = DataFrame(columns=data_cols, data=data)
    self.trainDataframe, self.valDataframe = (
        self.testDataframe.train_test_split(train_size=0.95))

    # print() calls (not Python 2 print statements) so the block parses
    # under Python 3.
    print('Size of trained data', len(self.trainDataframe))
    print('val size', len(self.valDataframe))
    print('size of test data', len(self.testDataframe))
def query_measurements_cleansed(self, field_query, begin_time, end_time):
    """
    Run ``queryMeasurementCleansed`` on the Java context.

    The three arguments are converted with ``_to_java_params`` and passed
    on unchanged in meaning; the cluster memory check runs first.

    Returns
    -------
    result : DataFrame
        Wraps the Java data frame returned by the query.
    """
    self._check_cluster_memory()

    # Convert the Python arguments to their Java-side equivalents.
    j_query, j_begin, j_end = self._to_java_params(
        field_query, begin_time, end_time)

    jdf = self._jec.queryMeasurementCleansed(j_query, j_begin, j_end)
    return DataFrame(jdf=jdf, sql_ctx=self._sql_ctx)
def __init__(self, data_dir):
    """Load the store/item series columns, split train/val, size the ids.

    Parameters
    ----------
    data_dir : str
        Directory holding one ``<column>.npy`` file per entry of
        ``data_cols``.
    """
    data_cols = [
        'x_raw',
        'onpromotion',
        'id',
        'x',
        'store_nbr',
        'item_nbr',
        'city',
        'state',
        'type',
        'cluster',
        'family',
        'class',
        'perishable',
        'is_discrete',
        'start_date',
        'x_lags',
        'xy_lags',
        'ts',
    ]

    # Memory-map each column read-only instead of reading it eagerly.
    data = []
    for col in data_cols:
        col_path = os.path.join(data_dir, '{}.npy'.format(col))
        data.append(np.load(col_path, mmap_mode='r'))

    full_df = DataFrame(columns=data_cols, data=data)
    self.test_df = full_df  # the unsplit frame doubles as the test set
    self.train_df, self.val_df = full_df.train_test_split(train_size=0.95)

    # Each count is the column's maximum value plus one.
    self.num_city = full_df['city'].max() + 1
    self.num_state = full_df['state'].max() + 1
    self.num_type = full_df['type'].max() + 1
    self.num_cluster = full_df['cluster'].max() + 1
    self.num_family = full_df['family'].max() + 1
    self.num_item_class = full_df['class'].max() + 1
    self.num_perishable = full_df['perishable'].max() + 1
    self.num_store_nbr = full_df['store_nbr'].max() + 1
    self.num_item_nbr = full_df['item_nbr'].max() + 1

    print('train size', len(self.train_df))
    print('val size', len(self.val_df))
    print('test size', len(self.test_df))
class DataReader(object):
    """Loads 'x', 'x_len', 'c', 'c_len' and serves trimmed batches."""

    def __init__(self, data_dir):
        """Load the four columns from ``data_dir`` and split train/val.

        ``test_df`` is the full frame; ``train_df``/``val_df`` are a seeded
        95/5 split of it.
        """
        data_cols = ['x', 'x_len', 'c', 'c_len']
        col_paths = (
            os.path.join(data_dir, '{}.npy'.format(col)) for col in data_cols
        )
        data = [np.load(col_path) for col_path in col_paths]

        full_df = DataFrame(columns=data_cols, data=data)
        self.test_df = full_df
        self.train_df, self.val_df = full_df.train_test_split(
            train_size=0.95, random_state=2018)

        print('train size', len(self.train_df))
        print('val size', len(self.val_df))
        print('test size', len(self.test_df))

    def train_batch_generator(self, batch_size):
        """Shuffled batches over the training split (10000 epochs)."""
        return self.batch_generator(batch_size=batch_size, df=self.train_df,
                                    shuffle=True, num_epochs=10000,
                                    mode='train')

    def val_batch_generator(self, batch_size):
        """Shuffled batches over the validation split (10000 epochs)."""
        return self.batch_generator(batch_size=batch_size, df=self.val_df,
                                    shuffle=True, num_epochs=10000,
                                    mode='val')

    def test_batch_generator(self, batch_size):
        """One unshuffled pass over the full frame."""
        return self.batch_generator(batch_size=batch_size, df=self.test_df,
                                    shuffle=False, num_epochs=1,
                                    mode='test')

    def batch_generator(self, batch_size, df, shuffle=True,
                        num_epochs=10000, mode='train'):
        """Yield batches from ``df`` with a shifted target ``y`` added.

        For every batch, 'x_len' is decremented by one, 'y' is 'x' shifted
        one step along axis 1, and 'x', 'y' and 'c' are cut to the longest
        length found in the batch.  A smaller final batch is only allowed
        when ``mode == 'test'``.
        """
        source = df.batch_generator(batch_size=batch_size,
                                    shuffle=shuffle,
                                    num_epochs=num_epochs,
                                    allow_smaller_final_batch=(mode == 'test'))
        for batch in source:
            # One step is given up so that y can be x shifted by one.
            batch['x_len'] = batch['x_len'] - 1
            x_stop = np.max(batch['x_len'])
            c_stop = np.max(batch['c_len'])

            strokes = batch['x']
            batch['y'] = strokes[:, 1:x_stop + 1, :]
            batch['x'] = strokes[:, :x_stop, :]
            batch['c'] = batch['c'][:, :c_stop]
            yield batch
def __init__(self, data_dir):
    """Load the page time-series columns and split train/val.

    Parameters
    ----------
    data_dir : str
        Directory holding one ``<column>.npy`` file per entry of
        ``data_cols``.
    """
    data_cols = [
        'data',
        'is_nan',
        'page_id',
        'project',
        'access',
        'agent',
        'test_data',
        'test_is_nan'
    ]

    data = []
    for col in data_cols:
        col_path = os.path.join(data_dir, '{}.npy'.format(col))
        data.append(np.load(col_path))

    # Build a DataFrame from the raw data (original note: 145063 rows).
    self.test_df = DataFrame(columns=data_cols, data=data)

    # Row-wise 95/5 split (original note: 137809 train / 7254 val rows).
    split = self.test_df.train_test_split(train_size=0.95)
    self.train_df, self.val_df = split

    print('train size', len(self.train_df))
    print('val size', len(self.val_df))
    print('test size', len(self.test_df))
def list_keys(self):
    """
    List the epidata measurement keys.

    Returns
    -------
    result : epidata DataFrame
        A DataFrame containing values of the principal fields used for
        classifying measurements.
    """
    self._check_cluster_memory()

    java_keys = self._jec.listKeys()
    return DataFrame(jdf=java_keys, sql_ctx=self._sql_ctx)
def main(args):
    """Train (or load) a captcha ResNet18 model and evaluate it.

    Parameters
    ----------
    args
        Parsed command-line namespace.  The attributes read here are
        ``dir``, ``captcha_size``, ``use_lowercase``, ``use_uppercase``,
        ``use_numbers``, ``test_size``, ``model_save``, ``optm``,
        ``batch_size`` and ``epochs``.
    """
    # Enable memory growth on every visible GPU.  Iterating (instead of
    # indexing [0]) also works on CPU-only machines, where the list is
    # empty, and keeps the setting identical across all GPUs.
    for device in tf.config.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(device, True)

    # get data object
    data = DataFrame(
        path=args.dir,
        n_symbols_in_captcha=args.captcha_size,
        use_lowercase=args.use_lowercase,
        use_uppercase=args.use_uppercase,
        use_numbers=args.use_numbers
    )

    # get data split
    (X_train, t_train), (X_test, t_test) = data.get_data(args.test_size)

    if os.path.exists(args.model_save):
        # load previously trained model if path exists
        model = tf.keras.models.load_model(args.model_save)
    else:
        # init network: one output per (symbol, captcha position) pair
        model = ResNet18(
            n_classes=(data.get_num_symbols() * args.captcha_size),
            data_format='channels_last')
        # compile network with given params
        model.compile(loss='binary_crossentropy', optimizer=args.optm,
                      metrics=["accuracy"])
        # train network
        model.fit(X_train, t_train, batch_size=args.batch_size,
                  epochs=args.epochs, verbose=1)
        # dump trained model to file system
        model.save(args.model_save)

    # evaluate performance; verbose=1 makes Keras print the result
    model.evaluate(X_test, t_test, verbose=1)
def __init__(self, data_dir):
    """Load the sales series columns and size the categorical ids.

    No train/validation split is made here; only ``test_df`` is set.

    Parameters
    ----------
    data_dir : str
        Directory holding one ``<column>.npy`` file per entry of
        ``data_cols``.
    """
    data_cols = [
        'x',
        'store_id',
        'item_id',
        'state_id',
        'dept_id',
        'cat_id',
        'wday',
        'month',
        'event_name_1',
        'event_type_1',
        'snap',
        'x_lags',
        'xy_lags',
        'ts',
        'sell_price',
        'sell_price_first_digit',  # to try
        'sell_price_last_digit',
        'start_date',
        'weights',
        'hierarchy_data',
        'all_id'
    ]
    # Memory-map each column read-only instead of reading it eagerly.
    data = [
        np.load(os.path.join(data_dir, '{}.npy'.format(col)), mmap_mode='r')
        for col in data_cols
    ]

    self.test_df = DataFrame(columns=data_cols, data=data)
    # self.train_df, self.val_df = self.test_df.train_test_split(train_size=0.95)

    # Each count is the column's maximum value plus one.
    self.num_store = self.test_df['store_id'].max() + 1
    self.num_item = self.test_df['item_id'].max() + 1
    self.num_state = self.test_df['state_id'].max() + 1
    self.num_dept = self.test_df['dept_id'].max() + 1
    self.num_cat = self.test_df['cat_id'].max() + 1
    self.num_wday = self.test_df['wday'].max() + 1
    self.num_month = self.test_df['month'].max() + 1
    self.num_event_name_1 = self.test_df['event_name_1'].max() + 1
    self.num_event_type_1 = self.test_df['event_type_1'].max() + 1

    # print('train size', len(self.train_df))
    # print('val size', len(self.val_df))
    # print() call (not a Python 2 print statement) so the block parses
    # under Python 3.
    print('test size', len(self.test_df))
def main():
    """Load 'x', 'x_len', 'c', 'c_len' from a dataset folder and draw them.

    The folder is taken from the single positional command-line argument
    ``dataset``.
    """
    parser = argparse.ArgumentParser(
        description='Converts npz files to actual training libraries')
    parser.add_argument('dataset', help='The dataset folder.')
    args = parser.parse_args()
    print(args)

    data_cols = ['x', 'x_len', 'c', 'c_len']
    data = []
    for col in data_cols:
        col_path = os.path.join(args.dataset, '{}.npy'.format(col))
        data.append(np.load(col_path))

    frame = DataFrame(columns=data_cols, data=data)
    DataDrawer(frame).run()
def IMR(measurements, meas_names=None):
    """
    Perform IMR analysis on a DataFrame of measurements. The measurements are
    grouped by the 'meas_name' field and IMR is performed on each group.

    Parameters
    ----------
    measurements : epidata DataFrame
        A DataFrame containing measurements, as returned by
        EpidataContext.query.
    meas_names : list of strings, or string or None, default None
        A list of measurement names on which to perform IMR, or a single
        measurement name on which to perform IMR. If None, all measurements
        will be analyzed.

    Returns
    -------
    result : epidata DataFrame
        A copy of the measurements DataFrame, with the IMR results appended
        as additional columns.

    Raises
    ------
    ValueError
        If ``measurements`` is not a DataFrame instance.
    """
    from context import ec

    # Only distributed (DataFrame) input is supported.
    if not isinstance(measurements, DataFrame):
        raise ValueError('Unsupported local measurements argument to IMR.')

    if isinstance(meas_names, basestring):
        # A single name: plain equality filter.
        name_filter = (measurements.meas_name == meas_names)
        measurements = measurements.filter(name_filter)
    elif meas_names:
        # A list of names: OR together one equality test per name.
        name_filter = (measurements.meas_name == meas_names[0])
        for extra_name in meas_names[1:]:
            name_filter = (
                name_filter | (measurements.meas_name == extra_name))
        measurements = measurements.filter(name_filter)

    pdf = measurements._pdf
    java_imr = ec._sc._jvm.com.epidata.spark.analytics.IMR.get()
    result_jdf = java_imr.applyToDataFrame(pdf._jdf)
    return DataFrame(jdf=result_jdf, sql_ctx=measurements._pdf.sql_ctx)
class DataReader(object):
    """Loads per-user order histories and serves next-step batches."""

    def __init__(self, data_dir):
        """Load one ``<column>.npy`` per column and split train/val.

        ``test_df`` is the full frame; ``train_df``/``val_df`` are a 90/10
        split of it.
        """
        data_cols = [
            'user_id',
            'history_length',
            'order_size_history',
            'reorder_size_history',
            'order_number_history',
            'order_dow_history',
            'order_hour_history',
            'days_since_prior_order_history',
        ]
        # Memory-map each column read-only instead of reading it eagerly.
        data = [
            np.load(os.path.join(data_dir, '{}.npy'.format(col)),
                    mmap_mode='r')
            for col in data_cols
        ]
        self.test_df = DataFrame(columns=data_cols, data=data)

        # print() calls (not Python 2 print statements) so the class
        # parses under Python 3.
        print(self.test_df.shapes())
        print('loaded data')

        self.train_df, self.val_df = self.test_df.train_test_split(
            train_size=0.9)

        print('train size', len(self.train_df))
        print('val size', len(self.val_df))
        print('test size', len(self.test_df))

    def train_batch_generator(self, batch_size):
        """Shuffled batches over the training split (10000 epochs)."""
        return self.batch_generator(
            batch_size=batch_size,
            df=self.train_df,
            shuffle=True,
            num_epochs=10000,
            is_test=False
        )

    def val_batch_generator(self, batch_size):
        """Shuffled batches over the validation split (10000 epochs)."""
        return self.batch_generator(
            batch_size=batch_size,
            df=self.val_df,
            shuffle=True,
            num_epochs=10000,
            is_test=False
        )

    def test_batch_generator(self, batch_size):
        """One unshuffled pass over the full frame."""
        return self.batch_generator(
            batch_size=batch_size,
            df=self.test_df,
            shuffle=False,
            num_epochs=1,
            is_test=True
        )

    def batch_generator(self, batch_size, df, shuffle=True,
                        num_epochs=10000, is_test=False):
        """Yield batches whose history columns are rolled one step left.

        Four history columns are replaced by ``np.roll(col, -1, axis=1)``
        and 'next_reorder_size' is added as the rolled
        'reorder_size_history'.  Outside test mode 'history_length' is
        reduced by one; a smaller final batch is only allowed in test mode.
        """
        batch_gen = df.batch_generator(
            batch_size,
            shuffle=shuffle,
            num_epochs=num_epochs,
            allow_smaller_final_batch=is_test,
        )
        for batch in batch_gen:
            batch['order_dow_history'] = np.roll(
                batch['order_dow_history'], -1, axis=1)
            batch['order_hour_history'] = np.roll(
                batch['order_hour_history'], -1, axis=1)
            batch['days_since_prior_order_history'] = np.roll(
                batch['days_since_prior_order_history'], -1, axis=1)
            batch['order_number_history'] = np.roll(
                batch['order_number_history'], -1, axis=1)
            # The source column 'reorder_size_history' itself is left as is.
            batch['next_reorder_size'] = np.roll(
                batch['reorder_size_history'], -1, axis=1)
            if not is_test:
                batch['history_length'] = batch['history_length'] - 1
            yield batch
class DataReader(object):
    """Loads per-user order histories and serves next-step batches."""

    def __init__(self, data_dir):
        """Load one ``<column>.npy`` per column and split train/val.

        ``test_df`` is the full frame; ``train_df``/``val_df`` are a 90/10
        split of it.
        """
        data_cols = [
            'user_id',
            'history_length',
            'order_size_history',
            'reorder_size_history',
            'order_number_history',
            'order_dow_history',
            'order_hour_history',
            'days_since_prior_order_history',
        ]
        # Memory-map each column read-only instead of reading it eagerly.
        data = [
            np.load(os.path.join(data_dir, '{}.npy'.format(col)),
                    mmap_mode='r')
            for col in data_cols
        ]
        self.test_df = DataFrame(columns=data_cols, data=data)

        # print() calls (not Python 2 print statements) so the class
        # parses under Python 3.
        print(self.test_df.shapes())
        print('loaded data')

        self.train_df, self.val_df = self.test_df.train_test_split(
            train_size=0.9)

        print('train size', len(self.train_df))
        print('val size', len(self.val_df))
        print('test size', len(self.test_df))

    def train_batch_generator(self, batch_size):
        """Shuffled batches over the training split (10000 epochs)."""
        return self.batch_generator(batch_size=batch_size,
                                    df=self.train_df,
                                    shuffle=True,
                                    num_epochs=10000,
                                    is_test=False)

    def val_batch_generator(self, batch_size):
        """Shuffled batches over the validation split (10000 epochs)."""
        return self.batch_generator(batch_size=batch_size,
                                    df=self.val_df,
                                    shuffle=True,
                                    num_epochs=10000,
                                    is_test=False)

    def test_batch_generator(self, batch_size):
        """One unshuffled pass over the full frame."""
        return self.batch_generator(batch_size=batch_size,
                                    df=self.test_df,
                                    shuffle=False,
                                    num_epochs=1,
                                    is_test=True)

    def batch_generator(self, batch_size, df, shuffle=True,
                        num_epochs=10000, is_test=False):
        """Yield batches whose history columns are rolled one step left.

        Four history columns are replaced by ``np.roll(col, -1, axis=1)``
        and 'next_reorder_size' is added as the rolled
        'reorder_size_history'.  Outside test mode 'history_length' is
        reduced by one; a smaller final batch is only allowed in test mode.
        """
        batch_gen = df.batch_generator(batch_size,
                                       shuffle=shuffle,
                                       num_epochs=num_epochs,
                                       allow_smaller_final_batch=is_test)
        for batch in batch_gen:
            batch['order_dow_history'] = np.roll(
                batch['order_dow_history'], -1, axis=1)
            batch['order_hour_history'] = np.roll(
                batch['order_hour_history'], -1, axis=1)
            batch['days_since_prior_order_history'] = np.roll(
                batch['days_since_prior_order_history'], -1, axis=1)
            batch['order_number_history'] = np.roll(
                batch['order_number_history'], -1, axis=1)
            # The source column 'reorder_size_history' itself is left as is.
            batch['next_reorder_size'] = np.roll(
                batch['reorder_size_history'], -1, axis=1)
            if not is_test:
                batch['history_length'] = batch['history_length'] - 1
            yield batch
class DataReader(object):
    """Loads page time series and serves encoder/decoder batches."""

    def __init__(self, data_dir):
        """Load one ``<column>.npy`` per column and split train/val.

        ``test_df`` is the full frame; ``train_df``/``val_df`` are a 95/5
        split of it.
        """
        data_cols = [
            'data',
            'is_nan',
            'page_id',
            'project',
            'access',
            'agent',
            'test_data',
            'test_is_nan'
        ]
        data = [
            np.load(os.path.join(data_dir, '{}.npy'.format(col)))
            for col in data_cols
        ]

        self.test_df = DataFrame(columns=data_cols, data=data)
        self.train_df, self.val_df = self.test_df.train_test_split(
            train_size=0.95)

        # print() calls (not Python 2 print statements) so the class
        # parses under Python 3.
        print('train size', len(self.train_df))
        print('val size', len(self.val_df))
        print('test size', len(self.test_df))

    def train_batch_generator(self, batch_size):
        """Shuffled batches over the training split (10000 epochs)."""
        return self.batch_generator(batch_size=batch_size,
                                    df=self.train_df,
                                    shuffle=True,
                                    num_epochs=10000,
                                    is_test=False)

    def val_batch_generator(self, batch_size):
        """Shuffled batches over the validation split (10000 epochs)."""
        return self.batch_generator(batch_size=batch_size,
                                    df=self.val_df,
                                    shuffle=True,
                                    num_epochs=10000,
                                    is_test=False)

    def test_batch_generator(self, batch_size):
        """One shuffled pass over the full frame, in test mode."""
        return self.batch_generator(batch_size=batch_size,
                                    df=self.test_df,
                                    shuffle=True,
                                    num_epochs=1,
                                    is_test=True)

    def batch_generator(self, batch_size, df, shuffle=True,
                        num_epochs=10000, is_test=False):
        """Yield batches extended with encoder/decoder arrays.

        Train/val mode reads 'data'/'is_nan', picks a random encode length
        per row and fills the 64-step decode targets from the values that
        follow it.  Test mode reads 'test_data'/'test_is_nan', encodes the
        whole series and leaves the decode targets at zero.

        Keys added to each batch: 'x_encode', 'encode_len', 'y_decode',
        'decode_len', 'is_nan_encode', 'is_nan_decode'.
        """
        batch_gen = df.batch_generator(batch_size=batch_size,
                                       shuffle=shuffle,
                                       num_epochs=num_epochs,
                                       allow_smaller_final_batch=is_test)
        data_col = 'test_data' if is_test else 'data'
        is_nan_col = 'test_is_nan' if is_test else 'is_nan'

        for batch in batch_gen:
            num_decode_steps = 64
            full_seq_len = batch[data_col].shape[1]
            # Outside test mode the last 64 steps are reserved as targets.
            if is_test:
                max_encode_length = full_seq_len
            else:
                max_encode_length = full_seq_len - num_decode_steps

            x_encode = np.zeros([len(batch), max_encode_length])
            y_decode = np.zeros([len(batch), num_decode_steps])
            is_nan_encode = np.zeros([len(batch), max_encode_length])
            is_nan_decode = np.zeros([len(batch), num_decode_steps])
            encode_len = np.zeros([len(batch)])
            decode_len = np.zeros([len(batch)])

            for i, (seq, nan_seq) in enumerate(
                    zip(batch[data_col], batch[is_nan_col])):
                # Drawn in test mode too (and then ignored), so the random
                # stream is consumed identically in both modes.
                rand_len = np.random.randint(max_encode_length - 365 + 1,
                                             max_encode_length + 1)
                x_encode_len = max_encode_length if is_test else rand_len

                x_encode[i, :x_encode_len] = seq[:x_encode_len]
                is_nan_encode[i, :x_encode_len] = nan_seq[:x_encode_len]
                encode_len[i] = x_encode_len
                decode_len[i] = num_decode_steps

                if not is_test:
                    decode_stop = x_encode_len + num_decode_steps
                    y_decode[i, :] = seq[x_encode_len:decode_stop]
                    is_nan_decode[i, :] = nan_seq[x_encode_len:decode_stop]

            batch['x_encode'] = x_encode
            batch['encode_len'] = encode_len
            batch['y_decode'] = y_decode
            batch['decode_len'] = decode_len
            batch['is_nan_encode'] = is_nan_encode
            batch['is_nan_decode'] = is_nan_decode
            yield batch
class DataReader(object):
    """Loads store time series and serves encoder/decoder batches."""

    def __init__(self, data_dir):
        """Load one ``<column>.npy`` per column, split 90/10, size the ids.

        ``test_df`` is the full frame; the ``num_*`` attributes are the
        column maximum plus one.
        """
        data_cols = [
            'x_raw',
            'holidayinfo',
            'air_store_id2',
            'id',
            'x',
            'air_area_name',
            'air_genre_name',
            'latitude',
            'longitude',
            'start_date',
            'x_lags',
            'xy_lags',
        ]
        # Memory-map each column read-only instead of reading it eagerly.
        data = []
        for col in data_cols:
            col_path = os.path.join(data_dir, '{}.npy'.format(col))
            data.append(np.load(col_path, mmap_mode='r'))

        self.test_df = DataFrame(columns=data_cols, data=data)
        self.train_df, self.val_df = self.test_df.train_test_split(
            train_size=0.9)

        self.num_area_name = self.test_df['air_area_name'].max() + 1
        self.num_genre_name = self.test_df['air_genre_name'].max() + 1
        self.num_store_id = self.test_df['air_store_id2'].max() + 1

        print('train size', len(self.train_df))
        print('val size', len(self.val_df))
        print('test size', len(self.test_df))

    def train_batch_generator(self, batch_size):
        """Shuffled batches over the training split (1000 epochs)."""
        return self.batch_generator(
            batch_size=batch_size,
            df=self.train_df,
            shuffle=True,
            num_epochs=1000,
            mode='train',
        )

    def val_batch_generator(self, batch_size):
        """Shuffled batches over the validation split (1000 epochs)."""
        return self.batch_generator(
            batch_size=batch_size,
            df=self.val_df,
            shuffle=True,
            num_epochs=1000,
            mode='val',
        )

    def test_batch_generator(self, batch_size):
        """One shuffled pass over the full frame, in test mode."""
        return self.batch_generator(
            batch_size=batch_size,
            df=self.test_df,
            shuffle=True,
            num_epochs=1,
            mode='test',
        )

    def batch_generator(self, batch_size, df, mode, shuffle=True,
                        num_epochs=1000):
        """Yield batches extended with zero-padded encoder/decoder arrays.

        For each row an encode window starting at ``start_date`` and a
        decode window of up to 39 steps right after it are copied into
        zero-initialised arrays.  The encode length is random in 'train'
        and 'val' mode and the full usable length in 'test' mode.  Rows too
        short for their mode get weight 0.

        Keys written to each batch: 'x_' (the untouched input 'x'), 'x',
        'y', 'x_raw', 'x_lags', 'y_lags', 'x_holi', 'y_holi', 'x_idx',
        'y_idx', 'y_id', 'x_len', 'y_len', 'weights'.
        """
        batch_gen = df.batch_generator(
            batch_size=batch_size,
            shuffle=shuffle,
            num_epochs=num_epochs,
            allow_smaller_final_batch=(mode == 'test'))

        num_decode_steps = 39
        val_window = 180
        train_window = 180

        for batch in batch_gen:
            n_rows = len(batch)
            # Usable encode span: series length minus the decode horizon.
            full_seq_len = batch['x'].shape[1] - num_decode_steps
            max_encode_length = full_seq_len
            n_x_lags = batch['x_lags'].shape[2]
            n_xy_lags = batch['xy_lags'].shape[2]

            x = np.zeros([n_rows, max_encode_length])
            y = np.zeros([n_rows, num_decode_steps])
            x_raw = np.zeros([n_rows, max_encode_length])
            # Encoder lags hold the x_lags features followed by xy_lags.
            x_lags = np.zeros(
                [n_rows, max_encode_length, n_x_lags + n_xy_lags])
            y_lags = np.zeros([n_rows, num_decode_steps, n_xy_lags])
            x_holi = np.zeros([n_rows, max_encode_length])
            y_holi = np.zeros([n_rows, num_decode_steps])
            x_len = np.zeros([n_rows])
            y_len = np.zeros([n_rows])
            x_idx = np.zeros([n_rows, max_encode_length])
            y_idx = np.zeros([n_rows, num_decode_steps])
            y_id = np.zeros([n_rows, num_decode_steps])
            #x_ts = np.zeros([len(batch), max_encode_length, batch['ts'].shape[2]])
            weights = np.zeros([n_rows])
            weights[:] = 1.0

            rows = zip(batch['x'], batch['x_raw'], batch['start_date'],
                       batch['x_lags'], batch['xy_lags'],
                       batch['holidayinfo'], batch['id'])
            for i, (series, raw_series, start_idx, x_lag, xy_lag,
                    holidays, ids) in enumerate(rows):
                seq_len = full_seq_len - start_idx

                if mode == 'train':
                    if seq_len == 0:
                        # Nothing to encode: mask the row out.
                        rand_encode_len = 0
                        weights[i] = 0
                    elif seq_len <= train_window:
                        rand_encode_len = np.random.randint(0, seq_len)
                    else:
                        rand_encode_len = np.random.randint(
                            seq_len - train_window, seq_len)
                    rand_decode_len = min(seq_len - rand_encode_len,
                                          num_decode_steps)
                elif mode == 'val':
                    if seq_len <= num_decode_steps:
                        # Too short for a full decode window: mask it out.
                        rand_encode_len = 0
                        weights[i] = 0
                    elif seq_len <= val_window + num_decode_steps:
                        rand_encode_len = np.random.randint(
                            0, seq_len - num_decode_steps + 1)
                    else:
                        rand_encode_len = np.random.randint(
                            seq_len - (val_window + num_decode_steps),
                            seq_len - num_decode_steps + 1)
                    rand_decode_len = min(seq_len - rand_encode_len,
                                          num_decode_steps)
                elif mode == 'test':
                    rand_encode_len = seq_len
                    rand_decode_len = num_decode_steps

                end_idx = start_idx + rand_encode_len
                decode_stop = end_idx + rand_decode_len
                lag_width = x_lag.shape[1]

                x[i, :rand_encode_len] = series[start_idx:end_idx]
                y[i, :rand_decode_len] = series[end_idx:decode_stop]
                x_raw[i, :rand_encode_len] = raw_series[start_idx:end_idx]

                x_lags[i, :rand_encode_len, :lag_width] = (
                    x_lag[start_idx:end_idx, :])
                x_lags[i, :rand_encode_len, lag_width:] = (
                    xy_lag[start_idx:end_idx, :])
                y_lags[i, :rand_decode_len, :] = xy_lag[end_idx:decode_stop, :]

                x_holi[i, :rand_encode_len] = holidays[start_idx:end_idx]
                y_holi[i, :rand_decode_len] = holidays[end_idx:decode_stop]

                # Position features: floor(log(position + 1)).
                encode_positions = np.arange(rand_encode_len)
                decode_positions = np.arange(
                    rand_encode_len, rand_encode_len + rand_decode_len)
                x_idx[i, :rand_encode_len] = np.floor(
                    np.log(encode_positions + 1))
                y_idx[i, :rand_decode_len] = np.floor(
                    np.log(decode_positions + 1))

                y_id[i, :rand_decode_len] = ids[end_idx:decode_stop]
                x_len[i] = end_idx - start_idx
                y_len[i] = rand_decode_len

            # Keep the untouched input under 'x_' before 'x' is replaced.
            batch['x_'] = batch['x']
            batch['x'] = x
            batch['y'] = y
            batch['x_raw'] = x_raw
            batch['x_lags'] = x_lags
            batch['y_lags'] = y_lags
            batch['x_holi'] = x_holi
            batch['y_holi'] = y_holi
            # batch['x_ts'] = x_ts
            batch['x_idx'] = x_idx
            batch['y_idx'] = y_idx
            batch['y_id'] = y_id
            batch['x_len'] = x_len
            batch['y_len'] = y_len
            # batch['item_class'] = batch['class']
            batch['weights'] = weights
            yield batch
class DataReader(object):
    """Loads user/product order histories and serves next-step batches."""

    def __init__(self, data_dir):
        """Load one ``<column>.npy`` per column and split train/val.

        ``test_df`` is the full frame; ``train_df``/``val_df`` are a 90/10
        split of it.
        """
        data_cols = [
            'user_id',
            'product_id',
            'aisle_id',
            'department_id',
            'is_ordered_history',
            'index_in_order_history',
            'order_dow_history',
            'order_hour_history',
            'days_since_prior_order_history',
            'order_size_history',
            'reorder_size_history',
            'order_is_weekend_history',
            'order_part_of_day_history',
            'order_number_history',
            'history_length',
            'product_name',
            'product_name_length',
            'eval_set',
            'label'
        ]
        # Memory-map each column read-only instead of reading it eagerly.
        data = []
        for col in data_cols:
            col_path = os.path.join(data_dir, '{}.npy'.format(col))
            data.append(np.load(col_path, mmap_mode='r'))

        self.test_df = DataFrame(columns=data_cols, data=data)

        print(self.test_df.shapes())
        print('loaded data')

        split = self.test_df.train_test_split(train_size=0.9)
        self.train_df, self.val_df = split

        print('train size', len(self.train_df))
        print('val size', len(self.val_df))
        print('test size', len(self.test_df))

    def train_batch_generator(self, batch_size):
        """Shuffled batches over the training split (10000 epochs)."""
        return self.batch_generator(
            batch_size=batch_size,
            df=self.train_df,
            shuffle=True,
            num_epochs=10000,
            is_test=False,
        )

    def val_batch_generator(self, batch_size):
        """Shuffled batches over the validation split (10000 epochs)."""
        return self.batch_generator(
            batch_size=batch_size,
            df=self.val_df,
            shuffle=True,
            num_epochs=10000,
            is_test=False,
        )

    def test_batch_generator(self, batch_size):
        """One unshuffled pass over the full frame."""
        return self.batch_generator(
            batch_size=batch_size,
            df=self.test_df,
            shuffle=False,
            num_epochs=1,
            is_test=True,
        )

    def batch_generator(self, batch_size, df, shuffle=True,
                        num_epochs=10000, is_test=False):
        """Yield batches whose history columns are rolled one step left.

        Six history columns are replaced by ``np.roll(col, -1, axis=1)``,
        'next_is_ordered' is added as the rolled 'is_ordered_history' and
        'is_none' marks rows whose 'product_id' equals 0.  Outside test
        mode 'history_length' is reduced by one; a smaller final batch is
        only allowed in test mode.
        """
        rolled_in_place = (
            'order_dow_history',
            'order_hour_history',
            'days_since_prior_order_history',
            'order_is_weekend_history',
            'order_part_of_day_history',
            'order_number_history',
        )
        batches = df.batch_generator(batch_size,
                                     shuffle=shuffle,
                                     num_epochs=num_epochs,
                                     allow_smaller_final_batch=is_test)
        for batch in batches:
            for key in rolled_in_place:
                batch[key] = np.roll(batch[key], -1, axis=1)

            # The source column 'is_ordered_history' itself is left as is.
            batch['next_is_ordered'] = np.roll(
                batch['is_ordered_history'], -1, axis=1)
            batch['is_none'] = batch['product_id'] == 0

            if not is_test:
                batch['history_length'] = batch['history_length'] - 1
            yield batch
class DataReader(object):
    """Loads the available stroke/text columns and serves trimmed batches."""

    def __init__(self, data_dir, output_dir, **kwargs):
        """Load the available columns, split train/val, save the val split.

        Parameters
        ----------
        data_dir : str
            Directory searched for one ``<column>.npy`` file per candidate
            column ('x', 'x_len', 'c', 'c_len', 'text').  Columns whose
            file does not exist are skipped.
        output_dir : str or path-like
            Directory in which ``self.val_df.dict`` is saved with
            ``np.save`` under a ``YYYY-MM-DD_HH-MM`` timestamp name.
        **kwargs
            Forwarded unchanged to the ``DataFrame`` constructor.
        """
        candidate_cols = ['x', 'x_len', 'c', 'c_len', 'text']

        # Keep only the columns whose .npy file is actually present.
        data_cols = [
            col for col in candidate_cols
            if os.path.exists(os.path.join(data_dir, '{}.npy'.format(col)))
        ]
        # Load every file exactly once.
        data = [
            np.load(os.path.join(data_dir, '{}.npy'.format(col)))
            for col in data_cols
        ]

        # test_df keeps the full frame; train/val are carved out of it.
        self.test_df = DataFrame(columns=data_cols, data=data, **kwargs)
        self.train_df, self.val_df = self.test_df.train_test_split(
            train_size=0.95, random_state=2018)

        # Save the validation split so it can be inspected later.
        date_str = datetime.now().strftime('%Y-%m-%d_%H-%M')
        np.save(Path(output_dir) / date_str, self.val_df.dict)

        # Load the strokes
        # np.load("../checkpoints/original/2020-03-13_15-03.npy", allow_pickle=True).item()["x"]

        print('train size', len(self.train_df))
        print('val size', len(self.val_df))
        print('test size', len(self.test_df))

    def train_batch_generator(self, batch_size):
        """Shuffled batches over the training split (10000 epochs)."""
        return self.batch_generator(
            batch_size=batch_size,
            df=self.train_df,
            shuffle=True,
            num_epochs=10000,
            mode='train'
        )

    def val_batch_generator(self, batch_size):
        """Shuffled batches over the validation split (10000 epochs)."""
        return self.batch_generator(
            batch_size=batch_size,
            df=self.val_df,
            shuffle=True,
            num_epochs=10000,
            mode='val'
        )

    def test_batch_generator(self, batch_size):
        """One unshuffled pass over the full frame."""
        return self.batch_generator(
            batch_size=batch_size,
            df=self.test_df,
            shuffle=False,
            num_epochs=1,
            mode='test'
        )

    def batch_generator(self, batch_size, df, shuffle=True,
                        num_epochs=10000, mode='train'):
        """Yield batches from ``df`` with a shifted target ``y`` added.

        For every batch, 'x_len' is decremented by one, 'y' is 'x' shifted
        one step along axis 1, and 'x', 'y' and 'c' are cut to the longest
        length found in the batch.  A smaller final batch is only allowed
        when ``mode == 'test'``.
        """
        gen = df.batch_generator(
            batch_size=batch_size,
            shuffle=shuffle,
            num_epochs=num_epochs,
            allow_smaller_final_batch=(mode == 'test')
        )
        for batch in gen:
            # One step is given up so that y can be x shifted by one.
            batch['x_len'] = batch['x_len'] - 1
            max_x_len = np.max(batch['x_len'])
            max_c_len = np.max(batch['c_len'])
            batch['y'] = batch['x'][:, 1:max_x_len + 1, :]
            batch['x'] = batch['x'][:, :max_x_len, :]
            batch['c'] = batch['c'][:, :max_c_len]
            yield batch