def read_and_translate(filename, destname):
    print('>> reading', filename)
    if READMODE != '.feather':
        df = read_file(filename)
    else:
        df = mlc.load(filename)
    print_memory()
    df = drop_to_save_memory(df)
    print_memory()
    print(df.head(5))
    for feature in df:
        df = desc_missing(df, feature)
    df_translated = df
    for feature in CAT_TRANSLATE:
        print('>> doing', feature)
        df_translated = translate_col_to_en(df_translated, feature)
    # DataFrames can be saved with the ultra-fast feather format.
    mlc.save(df_translated, destname)
    del df_translated
    gc.collect()
    df_translated = mlc.load(destname)
    print(df_translated.head())
    # main() assigns the result of this call, so return the reloaded frame.
    return df_translated
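
# print_memory and drop_to_save_memory are helpers defined elsewhere in this
# repo; a minimal sketch of what they might look like, assuming psutil is
# available and that "saving memory" means downcasting wide numeric dtypes:
import psutil

def print_memory():
    # Resident set size of the current process, in GB.
    mem_gb = psutil.Process().memory_info().rss / 1024 ** 3
    print('memory usage: {:.2f} GB'.format(mem_gb))

def drop_to_save_memory(df):
    # Downcast int64/float64 columns to the smallest dtype that fits.
    for col in df.select_dtypes(include=['int64']).columns:
        df[col] = pd.to_numeric(df[col], downcast='integer')
    for col in df.select_dtypes(include=['float64']).columns:
        df[col] = pd.to_numeric(df[col], downcast='float')
    gc.collect()
    return df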
def reduce():
    for name in ['train_active', 'test_active']:
        filename = '../input/{}.csv'.format(name)
        df = pd.read_csv(filename, usecols=['title'])
        dstname = '../input/{}_title.feather'.format(name)
        mlc.save(df, dstname)
        savename = '../input/{}_title_unique.pickle'.format(name)
        unique_element = df['title'].unique()
        with open(savename, 'wb') as handle:
            pickle.dump(unique_element, handle, protocol=pickle.HIGHEST_PROTOCOL)
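
# Loading the reduced artifacts back; a usage sketch assuming the paths
# written by reduce() above:
df_titles = mlc.load('../input/train_active_title.feather')
with open('../input/train_active_title_unique.pickle', 'rb') as handle:
    unique_titles = pickle.load(handle)
print(len(df_titles), 'rows,', len(unique_titles), 'unique titles')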
def feature_more(self):
    print("loading data...")
    df_train = pd.read_csv(self.section.get('train_file'),
                           dtype=self.tk_dtypes,
                           usecols=self.train_cols,
                           skiprows=self.skip_rows,
                           nrows=self.n_rows,
                           parse_dates=["click_time"])
    df_test = pd.read_csv(self.section.get('test_file'),
                          dtype=self.tk_dtypes,
                          nrows=self.n_rows,
                          usecols=self.test_cols,
                          parse_dates=["click_time"])
    self.len_train = df_train.shape
    self.len_test = df_test.shape
    # DataFrame.append is gone in pandas 2.x; concat does the same job here.
    df_train = pd.concat([df_train, df_test])
    del df_test
    gc.collect()

    print("Creating new time features: 'hour', 'day' and 'wday'...")
    dt = df_train["click_time"].dt
    df_train['hour'] = dt.hour.astype('uint8')
    df_train['day'] = dt.day.astype('uint8')
    df_train['wday'] = dt.dayofweek.astype('uint8')
    del dt
    gc.collect()

    # df_train = df_train.set_index('click_time').sort_index()
    df_train = self.get_agg_features(df=df_train, agg_cols=GROUPBY_AGGREGATIONS)
    df_train = self.get_click_order(df=df_train, click_act_dict=HISTORY_CLICKS)
    # df_train = self.get_next_click_time(df=df_train, click_groups=GROUP_BY_NEXT_CLICKS)
    # df_train = self.get_pre_click_time(df=df_train, click_groups=GROUP_BY_NEXT_CLICKS)
    df_train = self.get_roll_features(df=df_train, roll_cols=ROLLING_BY_TIME)

    print(f'begin to do LabelEncoder for {self.label_cols}')
    # apply() returns a new frame; assign the encoded columns back.
    df_train[self.label_cols] = df_train[self.label_cols].apply(
        LabelEncoder().fit_transform)
    # 'click_time' was never set as the index (see the commented line above),
    # so just drop the current index.
    df_train = df_train.reset_index(drop=True)

    print('get train and test dataset')
    data_train = df_train.iloc[:self.len_train[0]]
    data_test = df_train.iloc[self.len_train[0]:]
    del df_train
    gc.collect()
    # An assert on a (condition, message) tuple is always truthy; keep the
    # condition and the message separate.
    assert data_test.shape[0] == self.len_test[0], \
        'prepare data error: test data size not equal'
    # data_train.to_csv(self.section.get('train_file_tmp'), index=False)
    # data_test.to_csv(self.section.get('test_file_tmp'), index=False)
    mlc.save(data_train, self.section.get('train_file_tmp'))
    mlc.save(data_test, self.section.get('test_file_tmp'))
    return data_train, data_test
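
# GROUPBY_AGGREGATIONS and get_agg_features are defined elsewhere in this
# module; a minimal sketch of the groupby-aggregate-merge pattern they appear
# to implement, shown as a free function for brevity (the spec format and the
# example entries are assumptions):
GROUPBY_AGGREGATIONS = [
    {'groupby': ['ip', 'day', 'hour'], 'select': 'channel', 'agg': 'count'},
    {'groupby': ['ip', 'app'], 'select': 'channel', 'agg': 'nunique'},
]

def get_agg_features(df, agg_cols):
    for spec in agg_cols:
        # e.g. 'ip_day_hour_count_channel'
        name = '{}_{}_{}'.format('_'.join(spec['groupby']),
                                 spec['agg'], spec['select'])
        gp = (df.groupby(spec['groupby'])[spec['select']]
                .agg(spec['agg'])
                .reset_index()
                .rename(columns={spec['select']: name}))
        df = df.merge(gp, on=spec['groupby'], how='left')
    return df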
def convert_debug_mode_feather():
    for name in ['train', 'test', 'train_active', 'test_active',
                 'periods_train', 'periods_test',
                 'train_translated', 'train_active_translated',
                 'test_translated', 'test_active_translated']:
        for debug in [1, 2]:
            print('----------------------------------------------------------------')
            dstname = '../input/debug{}/{}_debug{}.csv'.format(debug, name, debug)
            t_start = time.time()
            print('>> loading', dstname)
            df = pd.read_csv(dstname)
            print('no. of rows:', len(df))
            print(df.head())
            t_end = time.time()
            print('loading time:', t_end - t_start)
            savename = '../input/debug{}/{}_debug{}.feather'.format(debug, name, debug)
            print('>> saving to', savename)
            mlc.save(df, savename)
            print('done')
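
# Quick sanity check that the feather copy round-trips and loads far faster
# than the CSV; the path assumes the debug1 layout produced above:
t0 = time.time()
df_feather = mlc.load('../input/debug1/train_debug1.feather')
print('feather load: {:.2f}s, rows: {}'.format(time.time() - t0, len(df_feather)))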
def read_and_translate(filename, destname):
    print('>> reading...')
    df = read_file(filename)
    # df.head(5)
    for feature in df:
        df = desc_missing(df, feature)
    df_translated = df
    for feature in CAT_TRANSLATE:
        print('>> doing', feature)
        df_translated = translate_col_to_en(df_translated, feature)
    # DataFrames can be saved with the ultra-fast feather format.
    mlc.save(df_translated, destname)
    del df_translated
    gc.collect()
    df_translated = mlc.load(destname)
    print(df_translated.head())
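
# translate_col_to_en is defined elsewhere; a minimal sketch, assuming a
# cached source-to-English dictionary (e.g. the one built by
# read_and_build_map) keyed by the original string, and that the column is
# translated in place:
def translate_col_to_en(df, feature, translation_map=None):
    translation_map = translation_map or {}
    # Fall back to the original value when no translation is cached.
    df[feature] = df[feature].map(lambda s: translation_map.get(s, s))
    return df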
def translate_textblob():
    print('--------------------------------------------------------------------------')
    debug = DEBUG
    if debug == 3:
        debug = 2
    name = DATASET
    if debug:
        dstname = '../input/debug{}/{}_debug{}.feather'.format(debug, name, debug)
        savename = '../input/debug{}/{}_textblob_debug{}.feather'.format(debug, name, debug)
    else:
        dstname = '../input/{}.feather'.format(name)
        savename = '../input/{}_textblob.feather'.format(name)
    if os.path.exists(savename):
        print('done already')
    else:
        t_start = time.time()
        print('>> loading', dstname)
        df = mlc.load(dstname)
        if DEBUG == 3:
            df = df.sample(frac=0.01)
        print('no. of rows:', len(df))
        t_end = time.time()
        print('loading time:', t_end - t_start)
        print_memory()
        print('>> translating')
        df_translated = map_translate(df)
        print(df_translated.head())
        print(df_translated.tail())
        print('>> saving', savename)
        mlc.save(df_translated, savename)
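
# map_translate is defined elsewhere; a minimal sketch using TextBlob's
# translate() (which proxies Google Translate and raises on short or
# already-English strings, hence the broad fallback). The column list is an
# assumption:
from textblob import TextBlob

def map_translate(df, columns=('title', 'description')):
    for col in columns:
        if col not in df:
            continue
        def _to_en(text):
            try:
                return str(TextBlob(str(text)).translate(to='en'))
            except Exception:
                return text  # NotTranslated, network errors, NaN, ...
        df[col] = df[col].map(_to_en)
    return df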
def main():
    global args, debug, NCHUNKS, READMODE
    args = parser.parse_args()
    datasets = args.dataset
    debug = args.debug
    READMODE = args.readmode
    if debug == 2:
        NCHUNKS = 13
    if debug == 1:
        NCHUNKS = 130
    if debug == 0:
        NCHUNKS = 2000
    print('summary: debug {}, chunks {}'.format(debug, NCHUNKS))
    if datasets == 'all':
        datasets = ['train', 'test', 'train_active', 'test_active']
    else:
        datasets = [datasets]
    for which_dataset in datasets:
        filename = '../input/' + which_dataset + READMODE
        read_and_build_map(filename, which_dataset)
        df_translated = read_and_translate(filename, which_dataset)
        destname = '../input/' + which_dataset + '_translated.feather'
        print('>> saving to ...', destname)
        mlc.save(df_translated, destname)
        del df_translated
        gc.collect()
        print('>> loading ...', destname)
        df_translated = mlc.load(destname)
        print(df_translated.head())
        df2 = df_translated.sample(frac=0.01)
        print(df2.head(5))
        print(df2.tail(5))
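
# The module-level parser referenced in main() is not shown; a minimal sketch
# consistent with the arguments used above (help strings and defaults are
# assumptions):
import argparse

parser = argparse.ArgumentParser(description='translate text columns to English')
parser.add_argument('--dataset', type=str, default='all',
                    help="one of train/test/train_active/test_active, or 'all'")
parser.add_argument('--debug', type=int, default=0,
                    help='0 = full run; higher values use smaller debug subsets')
parser.add_argument('--readmode', type=str, default='.feather',
                    help="input extension, e.g. '.csv' or '.feather'")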
def save_feather(df, filename):
    print_doing('saving to {}'.format(filename))
    mlc.save(df, filename)
all_image_lst = []

# This loop gets file sizes and a list of all image names.
print('Getting image sizes...')
try:
    img_sizes, all_image_lst = mlc.load('cache/image_sizes.pkl')
    print('Loaded image sizes from cache')
except FileNotFoundError:
    img_sizes = {}  # was used uninitialized on a cache miss
    for images, labels, file_locs in tqdm(img_batch_keep):
        for file_loc in file_locs:
            # (H, W) only; the full shape would duplicate the channel dim.
            size2 = lycon.load(file_loc).shape[:2]
            new_size = torch.Size([1, 3, *size2])
            img_sizes[file_loc] = new_size
            all_image_lst.append(file_loc)
    mlc.save([img_sizes, all_image_lst], 'cache/image_sizes.pkl')
print('{} images loaded'.format(len(all_image_lst)))

# Randomly take 50 images for the "test set".
test_images = random.sample(all_image_lst, 50)
del kuzu_keep
del img_batch_keep

# Initialize the generator and spread it over the available GPUs.
generator = nn.DataParallel(
    UnetGenerator(3, kuzu_targets.num_characters, 64),
    device_ids=[i for i in range(args.num_gpu)]).cuda()
ap = argparse.ArgumentParser()  # parser construction and the "-d/--dataset"
ap.add_argument("-d", "--dataset",  # flag are assumed from args["dataset"] below
                required=True,
                help="Path to the directory that contains the images")
ap.add_argument("-n", "--descriptor",
                required=True,
                help="descriptor = SURF, SIFT or ORB")
ap.add_argument("-o", "--output",
                required=True,
                help="Path to where the computed descriptors will be stored")
ap.add_argument("-t", "--threads",
                default=1,
                help="Number of threads to use for descriptor extraction")
args = vars(ap.parse_args())

# Read arguments.
path = args["dataset"]
descriptorName = args["descriptor"]
output = args["output"]
threads = int(args["threads"])

# Compute the descriptors; avoid shadowing the built-in dict.
describers = {"SURF": describeSURF, "SIFT": describeSIFT, "ORB": describeORB}
descriptors = getDescriptors(path, describers[descriptorName], threads)

print('Writing descriptors to disk ...')
mlc.save(descriptors, output + '.pkl')
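
# describeSURF/describeSIFT/describeORB and getDescriptors are defined
# elsewhere; a minimal sketch of one describer and the driver, assuming
# OpenCV (SIFT and ORB ship with recent builds; SURF needs opencv-contrib):
import glob
import os
from multiprocessing.pool import ThreadPool
import cv2

def describeSIFT(image):
    sift = cv2.SIFT_create()
    keypoints, descriptors = sift.detectAndCompute(image, None)
    return descriptors

def getDescriptors(path, describe, threads):
    files = sorted(glob.glob(os.path.join(path, '*')))
    def work(f):
        img = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
        return describe(img)
    with ThreadPool(threads) as pool:
        return pool.map(work, files)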
def convert_to_feather(filename, dstname):
    print('>> reading', filename)
    df = pd.read_csv(filename)
    print('no. of rows:', len(df))
    print('>> saving to ...', dstname)
    mlc.save(df, dstname)
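
# Example call; the CSV/feather paths follow the layout used elsewhere here:
convert_to_feather('../input/train.csv', '../input/train.feather')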
parser.add_argument("--ime", type=str, choices=enhanceMethods, help='Preprocess image with this method') else: print("Enhancement not available; install im_enhance if you want") cmdArgs = parser.parse_args() queryBag = ImageBag(cmdArgs.bagfile, cmdArgs.topic) mapsource = None mapsource = GenericImageDatabase(cmdArgs.mapfile) if hasattr(cmdArgs, 'ime') and (cmdArgs.ime is not None): from place_recognizer.GenericImageMap import ime mapsource.useEnhancement = True mapsource.enhanceMethod = eval('ime.' + prog_arguments.ime) else: imeMethod = False bagLock = Lock() orb = cv2.ORB_create(6000) print("Ready") pool4 = mlc.SuperPool(n_cpu=1) samples = queryBag.desample(-1, True, cmdArgs.start, cmdArgs.stop) positions = pool4.map(processQuery, samples) mlc.save(positions, cmdArgs.output) print("Done")