Example 1
def read_and_translate(filename, destname):
    print('>> reading', filename)
    if READMODE != '.feather':
        df = read_file(filename)
    else:
        df = mlc.load(filename)

    print_memory()
    df = drop_to_save_memory(df)
    print_memory()
    print(df.head(5))

    for feature in df:
        df = desc_missing(df, feature)

    df_translated = df
    for feature in CAT_TRANSLATE:
        print('>> doing', feature)
        df_translated = translate_col_to_en(df_translated, feature)

    mlc.save(
        df_translated,
        destname)  # DataFrames can be saved with ultra-fast feather format.
    del df_translated
    gc.collect()
    df_translated = mlc.load(destname)
    print(df_translated.head())
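read_file, print_memory and drop_to_save_memory are project helpers that are not shown in this example. A minimal sketch of what the two memory helpers might look like, assuming psutil for the RSS report and pandas dtype downcasting for the savings (names and details are assumptions, not the original code):

import gc
import psutil
import pandas as pd

def print_memory():
    # report the current process resident memory in GB
    rss_gb = psutil.Process().memory_info().rss / 1024 ** 3
    print('memory usage: {:.2f} GB'.format(rss_gb))

def drop_to_save_memory(df):
    # downcast numeric columns to the smallest dtype that holds the values
    for col in df.select_dtypes(include='integer').columns:
        df[col] = pd.to_numeric(df[col], downcast='integer')
    for col in df.select_dtypes(include='float').columns:
        df[col] = pd.to_numeric(df[col], downcast='float')
    gc.collect()
    return df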
Example 2
def reduce():
    for name in ['train_active', 'test_active']:
        filename = '../input/{}.csv'.format(name)
        df = pd.read_csv(filename, usecols=['title'])
        dstname = '../input/{}_title.feather'.format(name)
        mlc.save(df, dstname)
        savename = '../input/{}_title_unique.pickle'.format(name)
        unique_element = df['title'].unique()
        with open(savename, 'wb') as handle:
            pickle.dump(unique_element,
                        handle,
                        protocol=pickle.HIGHEST_PROTOCOL)
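The pickled array of unique titles can later be read back with the standard pickle API, for example:

with open('../input/train_active_title_unique.pickle', 'rb') as handle:
    unique_titles = pickle.load(handle)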
Example 3
    def feature_more(self):
        print("loading data...")
        df_train = pd.read_csv(self.section.get('train_file'),
                               dtype=self.tk_dtypes,
                               usecols=self.train_cols,
                               skiprows=self.skip_rows,
                               nrows=self.n_rows,
                               parse_dates=["click_time"])
        df_test = pd.read_csv(self.section.get('test_file'),
                              dtype=self.tk_dtypes,
                              nrows=self.n_rows,
                              usecols=self.test_cols,
                              parse_dates=["click_time"])
        self.len_train = df_train.shape
        self.len_test = df_test.shape
        df_train = pd.concat([df_train, df_test])
        del df_test
        gc.collect()
        print("Creating new time features: 'hour' and 'day'...")
        dt = df_train["click_time"].dt
        df_train['hour'] = dt.hour.astype('uint8')
        df_train['day'] = dt.day.astype('uint8')
        df_train['wday'] = dt.dayofweek.astype('uint8')
        del dt
        gc.collect()

        # df_train = df_train.set_index('click_time').sort_index()
        df_train = self.get_agg_features(df=df_train,
                                         agg_cols=GROUPBY_AGGREGATIONS)
        df_train = self.get_click_order(df=df_train,
                                        click_act_dict=HISTORY_CLICKS)
        # df_train = self.get_next_click_time(df=df_train, click_groups=GROUP_BY_NEXT_CLICKS)
        # df_train = self.get_pre_click_time(df=df_train, click_groups=GROUP_BY_NEXT_CLICKS)
        df_train = self.get_roll_features(df=df_train,
                                          roll_cols=ROLLING_BY_TIME)
        print(f'begin to do LabelEncoder for {self.label_cols}')
        df_train[self.label_cols] = df_train[self.label_cols].apply(
            LabelEncoder().fit_transform)
        df_train = df_train.reset_index('click_time', drop=True)

        print('get train and test dataset')
        data_train = df_train.iloc[:self.len_train[0]]
        data_test = df_train.iloc[self.len_train[0]:]
        del df_train
        gc.collect()
        assert data_test.shape[0] == self.len_test[0], \
            'prepare data error, test data size not equal'
        # data_train.to_csv(self.section.get('train_file_tmp'), index=False)
        # data_test.to_csv(self.section.get('test_file_tmp'), index=False)
        mlc.save(data_train, self.section.get('train_file_tmp'))
        mlc.save(data_test, self.section.get('test_file_tmp'))
        return data_train, data_test
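GROUPBY_AGGREGATIONS, HISTORY_CLICKS, ROLLING_BY_TIME and the get_* helpers are defined elsewhere in that project and are not shown. A rough sketch of what the group-by aggregation step could look like; the spec format and column names below are assumptions for illustration only:

# hypothetical spec format: group by some columns, aggregate another one
GROUPBY_AGGREGATIONS = [
    {'groupby': ['ip', 'day', 'hour'], 'select': 'channel', 'agg': 'count'},
    {'groupby': ['ip', 'app'], 'select': 'channel', 'agg': 'nunique'},
]

def get_agg_features(df, agg_cols):
    for spec in agg_cols:
        new_col = '{}_{}_{}'.format('_'.join(spec['groupby']), spec['agg'], spec['select'])
        # transform keeps the original row order, so the result can be assigned directly
        df[new_col] = df.groupby(spec['groupby'])[spec['select']].transform(spec['agg'])
        gc.collect()
    return df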
Example 4
def convert_debug_mode_feather():
    for name in ['train', 'test', 'train_active', 'test_active',
                 'periods_train', 'periods_test', 'train_translated',
                 'train_active_translated', 'test_translated', 'test_active_translated']:
        for debug in [1, 2]:
            print('----------------------------------------------------------------')
            dstname = '../input/debug{}/{}_debug{}.csv'.format(debug, name, debug)
            t_start = time.time()
            print('>> loading', dstname)
            df = pd.read_csv(dstname)
            print('no. of rows:', len(df))
            print(df.head())
            t_end = time.time()
            print('loading time:', t_end - t_start)

            savename = '../input/debug{}/{}_debug{}.feather'.format(debug, name, debug)
            print('>> saving to', savename)
            mlc.save(df, savename)
            print('done')
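To verify the speed-up of the feather format, the freshly written file can be loaded back and timed the same way (a small sketch reusing the names from the loop above):

t_start = time.time()
df = mlc.load(savename)  # reads the feather file back into a DataFrame
print('feather loading time:', time.time() - t_start)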
Example 5
def read_and_translate(filename, destname):
    print('>> reading...')
    df = read_file(filename)
    # df.head(5)

    for feature in df:
        df = desc_missing(df, feature)

    df_translated = df
    for feature in CAT_TRANSLATE:
        print('>> doing', feature)
        df_translated = translate_col_to_en(df_translated, feature)

    mlc.save(
        df_translated,
        destname)  # DataFrames can be saved with ultra-fast feather format.
    del df_translated
    gc.collect()
    df_translated = mlc.load(destname)
    print(df_translated.head())
Example 6
def translate_textblob():
    print('--------------------------------------------------------------------------')
    debug = DEBUG
    if debug == 3:
        debug = 2
    name = DATASET
    if debug:
        dstname = '../input/debug{}/{}_debug{}.feather'.format(debug, name, debug)
        savename = '../input/debug{}/{}_textblob_debug{}.feather'.format(debug, name, debug)
    else:
        dstname = '../input/{}.feather'.format(name)
        savename = '../input/{}_textblob.feather'.format(name)

    if os.path.exists(savename):
        print('done already')
    else:
        t_start = time.time()
        print('>> loading', dstname)
        df = mlc.load(dstname)
        if DEBUG == 3:
            df = df.sample(frac=0.01)
        print('no. of rows:', len(df))
        t_end = time.time()
        print('loading time:', t_end - t_start)
        print_memory()

        print('>> translating')
        df_translated = map_translate(df)
        print(df_translated.head())
        print(df_translated.tail())

        print('>> saving', savename)
        mlc.save(df_translated, savename)
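map_translate is another project helper that is not shown. A possible sketch using TextBlob (older TextBlob releases exposed a .translate() method backed by Google Translate; the column names here are assumptions):

from textblob import TextBlob

def map_translate(df):
    def to_en(text):
        try:
            return str(TextBlob(str(text)).translate(to='en'))
        except Exception:
            return text  # keep the original value if translation fails
    # translate the free-text columns row by row
    for col in ['title', 'description']:
        df[col] = df[col].apply(to_en)
    return df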
Example 7
def main():
    global args, debug, NCHUNKS, READMODE
    args = parser.parse_args()
    datasets = args.dataset
    debug = args.debug
    READMODE = args.readmode

    if debug == 2:
        NCHUNKS = 13
    if debug == 1:
        NCHUNKS = 130
    if debug == 0:
        NCHUNKS = 2000

    print('summary: debug {}, chunks {}'.format(debug, NCHUNKS))

    if datasets == 'all':
        datasets = ['train', 'test', 'train_active', 'test_active']
    else:
        datasets = [datasets]

    for which_dataset in datasets:
        filename = '../input/' + which_dataset + READMODE
        read_and_build_map(filename, which_dataset)
        df_translated = read_and_translate(filename, which_dataset)
        destname = '../input/' + which_dataset + '_translated.feather'
        print('>> saving to ...', destname)
        mlc.save(df_translated, destname)
        del df_translated
        gc.collect()
        print('>> loading ...', destname)
        df_translated = mlc.load(destname)
        print(df_translated.head())
        df2 = df_translated.sample(frac=0.01)
        print(df2.head(5))
        print(df2.tail(5))
Example 8
def save_feather(df, filename):
    print_doing('saving to {}'.format(filename))
    mlc.save(df, filename)
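A matching loader, sketched here for symmetry (print_doing is the same logging helper used above):

def load_feather(filename):
    print_doing('loading from {}'.format(filename))
    return mlc.load(filename)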
Example 9
# filled below, either from the cache or by scanning the images
img_sizes = {}
all_image_lst = []

#This loop is for getting file sizes and a list of all image names
print('Getting image sizes...')
try:
    img_sizes, all_image_lst = mlc.load('cache/image_sizes.pkl')
    print('Loaded image sizes from cache')
except FileNotFoundError:
    for _, (images, labels, file_locs) in enumerate(tqdm(img_batch_keep)):
        for file_loc in file_locs:
            size2 = lycon.load(file_loc).shape
            new_size = torch.Size([1, 3, *size2])
            img_sizes[file_loc] = new_size
            all_image_lst.append(file_loc)

    mlc.save([img_sizes, all_image_lst], 'cache/image_sizes.pkl')

print('{} images loaded'.format(len(all_image_lst)))

#This randomly takes 50 images for the "test set".
test_images = random.sample(all_image_lst, 50)

del kuzu_keep
del img_batch_keep

# initiate Generator
#This initializes the generator object

generator = nn.DataParallel(UnetGenerator(3, kuzu_targets.num_characters, 64),
                            device_ids=list(range(args.num_gpu))).cuda()
Example 10
                required=True,
                help="Path to the directory that contains the images")
ap.add_argument("-n",
                "--descriptor",
                required=True,
                help="descriptor = SURF, SIFT or  ORB")
ap.add_argument("-o",
                "--output",
                required=True,
                help="Path to where the computed descriptors will be stored")
ap.add_argument("-t",
                "--threads",
                default=1,
                help="Number of threads to use for descriptor extraction")
args = vars(ap.parse_args())

#reading arguments
path = args["dataset"]
descriptorName = args["descriptor"]
output = args["output"]
threads = int(args["threads"])

# computing the descriptors
descriptor_funcs = {"SURF": describeSURF, "SIFT": describeSIFT, "ORB": describeORB}
descriptors = getDescriptors(path, descriptor_funcs[descriptorName], threads)

print('Writing descriptors to disk ...')

# Write descriptors to disk
mlc.save(descriptors, output + '.pkl')
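describeSURF, describeSIFT, describeORB and getDescriptors come from the surrounding project and are not shown. As an illustration, a SIFT descriptor function for a single image could look roughly like this with OpenCV (the function name and return format are assumptions):

import cv2

def describeSIFT(image_path):
    # load the image in grayscale and compute SIFT keypoints/descriptors
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    sift = cv2.SIFT_create()
    keypoints, descriptors = sift.detectAndCompute(image, None)
    return descriptors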
Example 11
def convert_to_feather(filename, dstname):
    print('>> reading', filename)
    df = pd.read_csv(filename)
    print('no. of rows of filename:', len(df))
    print('>> saving to ...', dstname)
    mlc.save(df, dstname)
Example 12
        parser.add_argument("--ime",
                            type=str,
                            choices=enhanceMethods,
                            help='Preprocess image with this method')
    else:
        print("Enhancement not available; install im_enhance if you want")

    cmdArgs = parser.parse_args()

    queryBag = ImageBag(cmdArgs.bagfile, cmdArgs.topic)
    mapsource = None

    mapsource = GenericImageDatabase(cmdArgs.mapfile)

    if hasattr(cmdArgs, 'ime') and (cmdArgs.ime is not None):
        from place_recognizer.GenericImageMap import ime
        mapsource.useEnhancement = True
        # look up the selected enhancement method on the ime module
        mapsource.enhanceMethod = getattr(ime, cmdArgs.ime)
    else:
        imeMethod = False

    bagLock = Lock()
    orb = cv2.ORB_create(6000)

    print("Ready")
    pool4 = mlc.SuperPool(n_cpu=1)
    samples = queryBag.desample(-1, True, cmdArgs.start, cmdArgs.stop)
    positions = pool4.map(processQuery, samples)
    mlc.save(positions, cmdArgs.output)
    print("Done")
Example 13
                required=True,
                help="Path to the directory that contains the images")
ap.add_argument("-n",
                "--descriptor",
                required=True,
                help="descriptor = SURF, SIFT or  ORB")
ap.add_argument("-o",
                "--output",
                required=True,
                help="Path to where the computed descriptors will be stored")
ap.add_argument("-t",
                "--threads",
                default=1,
                help="Number of threads to use for descriptor extraction")
args = vars(ap.parse_args())

#reading arguments
path = args["dataset"]
descriptorName = args["descriptor"]
output = args["output"]
threads = int(args["threads"])

# computing the descriptors
descriptor_funcs = {"SURF": describeSURF, "SIFT": describeSIFT, "ORB": describeORB}
descriptors = getDescriptors(path, descriptor_funcs[descriptorName], threads)

print('Writing descriptors to disk ...')

# Write descriptors to disk
mlc.save(descriptors, output + '.pickle')