def read_and_translate(filename, destname):
    """Read a dataset, translate its categorical columns and save the result.

    The frame is read with ``read_file`` unless the module-level ``READMODE``
    equals ``'.feather'``, in which case ``mlc.load`` is used. It is then
    passed through ``drop_to_save_memory``, ``desc_missing`` (once per column)
    and ``translate_col_to_en`` (once per entry of ``CAT_TRANSLATE``), written
    to ``destname`` with ``mlc.save`` and loaded back so its head can be
    printed.

    Args:
        filename: Path of the input file.
        destname: Path the translated frame is saved to.

    Returns:
        None. Progress is reported on stdout.
    """
    print('>> reading', filename)
    frame = mlc.load(filename) if READMODE == '.feather' else read_file(filename)

    print_memory()
    frame = drop_to_save_memory(frame)
    print_memory()
    print(frame.head(5))

    for column in frame:
        frame = desc_missing(frame, column)

    translated = frame
    for column in CAT_TRANSLATE:
        print('>> doing', column)
        translated = translate_col_to_en(translated, column)

    # DataFrames can be saved with ultra-fast feather format.
    mlc.save(translated, destname)

    # Drop this reference and run the collector before reading the file back.
    del translated
    gc.collect()

    reloaded = mlc.load(destname)
    print(reloaded.head())
def prepare_data(self):
    """Load (or build) the train/test frames and record column metadata.

    When both paths stored under ``train_file_tmp`` and ``test_file_tmp`` in
    ``self.section`` exist, the frames are loaded with ``mlc.load``;
    otherwise they come from ``self.feature_more()``. The ``'ip'`` column is
    dropped from the training frame only.

    Sets ``self.len_train``, ``self.len_test`` (both hold ``.shape``),
    ``self.sample_cols``, ``self.embedding_cols`` and fills ``self.col_max``.

    Returns:
        Tuple ``(train_data, test_data)``.
    """
    train_cache = self.section.get('train_file_tmp')
    test_cache = self.section.get('test_file_tmp')

    have_cache = os.path.exists(train_cache) and os.path.exists(test_cache)
    if not have_cache:
        train_data, test_data = self.feature_more()
    else:
        train_data = mlc.load(train_cache)
        test_data = mlc.load(test_cache)

    train_data = train_data.drop('ip', axis=1)

    self.len_train = train_data.shape
    self.len_test = test_data.shape

    # Columns picked out by ``sample_match`` (used as an index into the names).
    column_names = train_data.columns.values
    self.sample_cols = column_names[sample_match(
        train_data.columns.values)].tolist()

    # Every training column except the label and 'click_id'.
    excluded = [self.label, 'click_id']
    self.embedding_cols = train_data.columns.difference(excluded).tolist()

    # Largest value seen in either frame, plus one.
    # NOTE(review): presumably the size of an embedding table - confirm.
    for name in self.embedding_cols:
        train_peak = train_data[name].max()
        test_peak = test_data[name].max()
        self.col_max[name] = max(train_peak, test_peak) + 1

    return train_data, test_data
def check_if_all_translated():
    """Report, per translated dataset, which values were left untranslated.

    For each of the four ``*_translated`` datasets the feather file is loaded
    with ``mlc.load`` (from the ``debug`` folder when the module-level
    ``DEBUG`` is truthy). For every feature in ``CAT_TRANSLATE`` a row counts
    as "not translated" when the value in ``feature`` equals the value in
    ``feature + '_en'``. The count, the unique untranslated values and their
    counts (from ``find_unique_element_and_count``) are printed.

    Returns:
        None. All results are printed to stdout.
    """
    for name in ['train_translated', 'train_active_translated',
                 'test_translated', 'test_active_translated']:
        print('--------------------------------------------------------------------------')
        debug = DEBUG
        if debug:
            dstname = '../input/debug{}/{}_debug{}.feather'.format(debug, name, debug)
        else:
            dstname = '../input/{}.feather'.format(name)

        t_start = time.time()
        print('>> loading', dstname)
        df = mlc.load(dstname)
        print('no. of rows:', len(df))
        t_end = time.time()
        print('loading time:', t_end - t_start)

        for feature in CAT_TRANSLATE:
            print('>> doing', feature)
            translated_feature = feature + '_en'
            list_not_translated = []
            count_not_translated = 0
            for index, row in df.iterrows():
                # Progress marker every 100000 index values.
                if index % 100000 == 0:
                    print(index, '/', len(df))
                if row[feature] == row[translated_feature]:
                    count_not_translated += 1
                    # append() is O(1) amortised; the previous
                    # ``lst = lst + [x]`` copied the whole list on every hit.
                    list_not_translated.append(row[feature])

            print('feature {} not translated {}/{}'.format(
                feature, count_not_translated, len(df)))
            list_not_translated_unique, count_not_translated_unique = \
                find_unique_element_and_count(list_not_translated)
            print('list not translated', list_not_translated_unique)
            print('count not translated', count_not_translated_unique)
def write_debug_mode_csv():
    """Write down-sampled CSV copies of every dataset for the debug modes.

    Each ``../input/<name>.feather`` file is loaded with ``mlc.load`` and
    sampled with ``random_state=SEED``: 10% of the rows for debug level 1 and
    0.1% for debug level 2. Each sample is written, without the index, to
    ``../input/debug<level>/<name>_debug<level>.csv``.

    The source file is loaded once per dataset and reused for both debug
    levels; the load does not depend on the level.

    Returns:
        None. Progress is printed to stdout.
    """
    names = ['train', 'test', 'train_active', 'test_active',
             'periods_train', 'periods_test',
             'train_translated', 'train_active_translated',
             'test_translated', 'test_active_translated']
    # Fraction of rows kept for each debug level.
    sample_fraction = {1: 0.1, 2: 0.001}

    for name in names:
        print('----------------------------------------------------------------')
        dstname = '../input/{}.feather'.format(name)
        t_start = time.time()
        print('>> loading', dstname)
        df = mlc.load(dstname)
        print('no. of rows:', len(df))
        print(df.head())
        t_end = time.time()
        print('loading time:', t_end - t_start)

        for debug, frac in sample_fraction.items():
            savename = '../input/debug{}/{}_debug{}.csv'.format(debug, name, debug)
            df_extracted = df.sample(frac=frac, random_state=SEED)
            print('no. of rows:', len(df_extracted))
            print('>> saving to', savename)
            df_extracted.to_csv(savename, index=False)
            print('done')
def test_load_time():
    """Load every feather dataset in turn and print how long each pass took.

    The reported interval runs from just before ``mlc.load`` until after the
    frame has been printed, deleted and garbage-collected.

    Returns:
        None. Timings are printed to stdout.
    """
    dataset_names = (
        'train', 'test', 'train_active', 'test_active',
        'periods_train', 'periods_test',
        'train_translated', 'train_active_translated',
        'test_translated', 'test_active_translated',
    )
    separator = '----------------------------------------------------------------'

    for dataset in dataset_names:
        print(separator)
        path = '../input/' + dataset + '.feather'

        started = time.time()
        print('>> loading', path)
        frame = mlc.load(path)
        print('no. of rows:', len(frame))
        print(frame.head())

        # Release the frame before the clock is stopped.
        del frame
        gc.collect()

        elapsed = time.time() - started
        print('loading time:', elapsed)
def read_and_translate(filename, which_dataset):
    """Add an ``<feature>_en`` column for every feature in ``CAT_TRANSLATE``.

    The frame is read with ``read_file`` unless the module-level ``READMODE``
    equals ``'.feather'``, in which case ``mlc.load`` is used. For each
    feature the mapping pickled at
    ``../dict/dict_ru_to_en_<which_dataset>_<feature>.pickle`` is loaded and
    applied value by value.

    Args:
        filename: Path of the input file.
        which_dataset: Dataset name used to locate the pickled dictionaries.

    Returns:
        The loaded frame with the new ``*_en`` columns added in place.

    Raises:
        KeyError: If a value of a feature column is missing from its mapping.
    """
    print('>> reading', filename)
    if READMODE != '.feather':
        df = read_file(filename)
    else:
        df = mlc.load(filename)

    # Same object as ``df``: the new columns are written onto the loaded frame.
    df_translated = df
    for feature in CAT_TRANSLATE:
        print('>> translating', feature)
        dstname = '../dict/dict_ru_to_en_{}_{}.pickle'.format(
            which_dataset, feature)
        # Context manager closes the file promptly; the previous bare open()
        # left the handle for the garbage collector.
        # NOTE: pickle must only be used on trusted, locally produced files.
        with open(dstname, 'rb') as handle:
            map_dict = pickle.load(handle)

        new_feature = feature + '_en'
        # Plain indexing (not Series.map) so an unmapped value raises KeyError
        # instead of silently becoming NaN.
        df_translated[new_feature] = df[feature].apply(lambda x: map_dict[x])
    return df_translated
def read_and_translate(filename, destname):
    """Read a dataset, translate its categorical columns and save the result.

    The frame from ``read_file`` goes through ``desc_missing`` once per column
    and ``translate_col_to_en`` once per entry of ``CAT_TRANSLATE``. It is
    then written to ``destname`` with ``mlc.save`` and loaded back so its head
    can be printed.

    Args:
        filename: Path of the input file.
        destname: Path the translated frame is saved to.

    Returns:
        None. Progress is reported on stdout.
    """
    print('>> reading...')
    frame = read_file(filename)

    for column in frame:
        frame = desc_missing(frame, column)

    translated = frame
    for column in CAT_TRANSLATE:
        print('>> doing', column)
        translated = translate_col_to_en(translated, column)

    # DataFrames can be saved with ultra-fast feather format.
    mlc.save(translated, destname)

    # Drop this reference and run the collector before reading the file back.
    del translated
    gc.collect()

    reloaded = mlc.load(destname)
    print(reloaded.head())
def read_and_build_dict(filename, which_dataset, from_iloc):
    """Load a dataset and run ``translate_col_and_save`` on each feature.

    The frame is read with ``read_file`` unless the module-level ``READMODE``
    equals ``'.feather'``, in which case ``mlc.load`` is used. It then passes
    through ``drop_to_save_memory`` and ``desc_missing`` (once per column)
    before every entry of ``CAT_TRANSLATE`` is handed to
    ``translate_col_and_save``.

    Args:
        filename: Path of the input file.
        which_dataset: Forwarded unchanged to ``translate_col_and_save``.
        from_iloc: Forwarded unchanged to ``translate_col_and_save``.

    Returns:
        None.
    """
    print('>> reading', filename)
    frame = mlc.load(filename) if READMODE == '.feather' else read_file(filename)

    print_memory()
    frame = drop_to_save_memory(frame)
    print_memory()
    print(frame.head(5))

    for column in frame:
        frame = desc_missing(frame, column)

    for column in CAT_TRANSLATE:
        print('>> doing', column)
        translate_col_and_save(frame, column, which_dataset, from_iloc)
def read_and_build_map(filename, which_dataset):
    """Merge per-chunk translation pickles into one dictionary per feature.

    For every feature in ``CAT_TRANSLATE`` the partial dictionaries
    ``../dict_part/translated_<which_dataset>_<feature>_<k>.pickle`` are
    loaded in order and merged. The merged dictionary is written to
    ``../dict/dict_ru_to_en_<which_dataset>_<feature>.pickle``. A feature is
    skipped when that final file already exists, and nothing is written for
    it when any part file is missing.

    Args:
        filename: Path of the input file. It is read with ``read_file``
            unless ``READMODE`` equals ``'.feather'``, in which case
            ``mlc.load`` is used.
        which_dataset: Dataset name used in the pickle file names.

    Returns:
        None. Progress is printed to stdout.
    """
    print('>> reading', filename)
    if READMODE != '.feather':
        df = read_file(filename)
    else:
        df = mlc.load(filename)

    for feature in CAT_TRANSLATE:
        dstname = '../dict/dict_ru_to_en_{}_{}.pickle'.format(
            which_dataset, feature)
        print('>> doing', feature)
        unique_element = df[feature].unique()
        # ceil(len(unique_element) / NCHUNKS) part files are expected.
        num_split = len(range(0, len(unique_element), NCHUNKS))
        print_memory()

        if os.path.exists(dstname):
            print('done already')
            continue

        map_dict = {}
        for k in range(num_split):
            savename = '../dict_part/translated_{}_{}_{}.pickle'.format(
                which_dataset, feature, k)
            if not os.path.exists(savename):
                print('missing', savename, '. Please check!!')
                # Stop at the first gap so an incomplete dict is never saved.
                break
            print('loading', savename)
            # Context manager closes the part file promptly; the previous
            # bare open() left the handle for the garbage collector.
            # NOTE: pickle must only be used on trusted, locally produced files.
            with open(savename, 'rb') as handle:
                map_temp = pickle.load(handle)
            print('updating map')
            map_dict.update(map_temp)
        else:
            # Reached only when every part was found (no break above).
            print('saving final dict to', dstname)
            with open(dstname, 'wb') as handle:
                pickle.dump(map_dict, handle,
                            protocol=pickle.HIGHEST_PROTOCOL)
def translate_textblob():
    """Translate the dataset named by ``DATASET`` and save it as feather.

    Paths are taken from the ``debug`` folder when the module-level ``DEBUG``
    is truthy; level 3 reuses the level-2 files and additionally keeps a 1%
    random sample of the rows. Nothing is done when the output file already
    exists. Otherwise the frame is loaded with ``mlc.load``, passed to
    ``map_translate`` and written with ``mlc.save``.

    Returns:
        None. Progress is printed to stdout.
    """
    print(
        '--------------------------------------------------------------------------'
    )
    # Debug level 3 shares its input/output locations with level 2.
    level = 2 if DEBUG == 3 else DEBUG
    dataset = DATASET

    if not level:
        source_path = '../input/{}.feather'.format(dataset)
        target_path = '../input/{}_textblob.feather'.format(dataset)
    else:
        source_path = '../input/debug{}/{}_debug{}.feather'.format(
            level, dataset, level)
        target_path = '../input/debug{}/{}_textblob_debug{}.feather'.format(
            level, dataset, level)

    if os.path.exists(target_path):
        print('done already')
        return

    started = time.time()
    print('>> loading', source_path)
    frame = mlc.load(source_path)
    if DEBUG == 3:
        frame = frame.sample(frac=0.01)
    print('no. of rows:', len(frame))
    finished = time.time()
    print('loading time:', finished - started)
    print_memory()

    print('>> translating')
    translated = map_translate(frame)
    print(translated.head())
    print(translated.tail())

    print('>> saving', target_path)
    mlc.save(translated, target_path)
def convert_to_pickle():
    """Re-save the translated ``*_active`` datasets as pickles and time both loads.

    For each dataset the feather file is loaded with ``mlc.load`` (timed),
    dumped to ``../input/<name>.pickle`` with the highest pickle protocol,
    released, and then loaded back from the pickle (timed).

    Returns:
        None. Timings and row counts are printed to stdout.
    """
    for name in ['train_active_translated', 'test_active_translated']:
        filename = '../input/{}.feather'.format(name)
        print(
            '----------------------------------------------------------------')
        print('>> loading', filename)
        t_start = time.time()
        df = mlc.load(filename)
        t_end = time.time()
        print('loading time feather:', t_end - t_start)
        print('no. of rows:', len(df))

        dstname = '../input/{}.pickle'.format(name)
        print('>> saving', dstname)
        with open(dstname, 'wb') as handle:
            pickle.dump(df, handle, protocol=pickle.HIGHEST_PROTOCOL)

        # Release the frame before reading it back.
        del df
        gc.collect()

        print('>> loading', dstname)
        t_start = time.time()
        # Context manager closes the file promptly; the previous bare open()
        # left the handle for the garbage collector.
        with open(dstname, 'rb') as handle:
            df = pickle.load(handle)
        t_end = time.time()
        print('loading time pickle:', t_end - t_start)
        print('no. of rows:', len(df))
def main():
    """Parse the command line and translate the requested dataset(s).

    Sets the module-level ``args``, ``debug`` and ``READMODE`` from the parsed
    arguments, and ``NCHUNKS`` from the debug level (2 -> 13, 1 -> 130,
    0 -> 2000; any other level leaves ``NCHUNKS`` untouched). For each dataset
    it calls ``read_and_build_map`` and ``read_and_translate``, saves the
    result to ``../input/<dataset>_translated.feather``, then reloads it and
    prints the head plus the head and tail of a 1% sample.

    Returns:
        None.
    """
    global args, debug, NCHUNKS, READMODE

    args = parser.parse_args()
    requested = args.dataset
    debug = args.debug
    READMODE = args.readmode

    chunks_by_level = {2: 13, 1: 130, 0: 2000}
    if debug in chunks_by_level:
        NCHUNKS = chunks_by_level[debug]
    print('summary: debug {}, chunks {}'.format(debug, NCHUNKS))

    if requested != 'all':
        dataset_names = [requested]
    else:
        dataset_names = ['train', 'test', 'train_active', 'test_active']

    for dataset in dataset_names:
        source_path = '../input/' + dataset + READMODE
        read_and_build_map(source_path, dataset)
        translated = read_and_translate(source_path, dataset)

        target_path = '../input/' + dataset + '_translated.feather'
        print('>> saving to ...', target_path)
        mlc.save(translated, target_path)

        # Release the frame before reading the saved file back.
        del translated
        gc.collect()

        print('>> loading ...', target_path)
        reloaded = mlc.load(target_path)
        print(reloaded.head())

        preview = reloaded.sample(frac=0.01)
        print(preview.head(5))
        print(preview.tail(5))
def load_feather(filename):
    """Announce the read through ``print_doing`` and return ``mlc.load(filename)``.

    Args:
        filename: Path handed to ``mlc.load``.

    Returns:
        Whatever ``mlc.load`` returns for ``filename``.
    """
    message = 'read {}'.format(filename)
    print_doing(message)
    return mlc.load(filename)
num_workers=2) kuzu_keep = ImageFolder( kuzu, transform=torchvision.transforms.Compose(transforms_keep)) img_batch_keep = torch.utils.data.DataLoader(kuzu_keep, batch_size=batch_size, shuffle=True, num_workers=2) img_sizes = {} all_image_lst = [] #This loop is for getting file sizes and a list of all image names print('Getting image sizes...') try: img_sizes, all_image_lst = mlc.load('cache/image_sizes.pkl') print('Loaded image sizes from cache') except FileNotFoundError: for _, (images, labels, file_locs) in enumerate(tqdm(img_batch_keep)): for file_loc in file_locs: size2 = lycon.load(file_loc).shape new_size = torch.Size([1, 3, *size2]) img_sizes[file_loc] = new_size all_image_lst.append(file_loc) mlc.save([img_sizes, all_image_lst], 'cache/image_sizes.pkl') print('{} images loaded'.format(len(all_image_lst))) #This randomly takes 50 images for the "test set". test_images = random.sample(all_image_lst, 50)