def save_feather():
    train = utils.load_csv()
    for c in tqdm(train.columns):
        train[[c]].to_feather(f'features/train/{c}.ftr')
    test = utils.load_csv(test=True)
    for c in tqdm(test.columns):
        test[[c]].to_feather(f'features/test/{c}.ftr')
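# A minimal sketch (not from the original source) of how the per-column
# feather files written by save_feather() could be reloaded and reassembled.
# The feature-name list and the load_features name are assumptions for
# illustration only; the paths mirror the ones used above.
import pandas as pd

def load_features(feature_names, split='train'):
    # Read one single-column frame per feature and concatenate column-wise.
    dfs = [pd.read_feather(f'features/{split}/{name}.ftr') for name in feature_names]
    return pd.concat(dfs, axis=1)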
def main(clf):
    x_train, y_train = utils.load_csv(POSTURES_TRAINING_DATA,
                                      feature_idx=range(2, 39),
                                      label_idx=1)
    x_test, y_test = utils.load_csv(POSTURES_TEST_DATA,
                                    feature_idx=range(2, 39),
                                    label_idx=1)
    model = models.PosturesEstimator(clf,
                                     aggregation=True,
                                     knn_n_neighbors=9,
                                     svm_c=10 ** 0.6,
                                     svm_gamma=10 ** -2.0,
                                     nn_validation=True)
    assert len(x_train) == len(y_train) and len(x_test) == len(y_test), \
        'Data size not match'
    model.fit(x_train, y_train)
    acc_train = model.evaluate(x_train, y_train, cm='training.png')
    LOGGER.info('Training Accuracy: %.6f' % acc_train)
    acc_cv = model.cross_validate(x_train, y_train)
    if acc_cv is not None:
        LOGGER.info('Cross Validation Accuracy: %.6f ± %.6f' %
                    (np.mean(acc_cv), np.std(acc_cv)))
    acc_test = model.evaluate(x_test, y_test, cm='test.png')
    LOGGER.info('Test Accuracy: %.6f' % acc_test)
def __init__(self, config=cfg, cache=True):
    if not cache or not os.path.isfile(cfg.data_cache):
        self.train, self.val = self.train_val_split(
            utils.load_csv(cfg.train_csv), 0.9)
        self.test = utils.load_csv(cfg.test_csv, shuffle=False)
        utils.save_cache([self.train, self.val, self.test], cfg.data_cache)
    else:
        self.train, self.val, self.test = utils.load_cache(cfg.data_cache)
def main():
    exp_name = f'baseline_{now()}'
    device, log, result_dir = setup(exp_name, conf)
    train_df = load_csv(conf.train_csv)
    if conf.npy:
        train_images = np.load(conf.train_images)
    else:
        train_images = pd.read_parquet(conf.train_images)
    test_df = load_csv(conf.test_csv)
    if conf.npy:
        test_images = np.load(conf.test_images)
    else:
        test_images = pd.read_parquet(conf.test_images)
    log.info('done')

    for i in range(5):
        if i != conf.fold:
            continue
        if "resnet" in conf.arch or "resnext" in conf.arch:
            model_ft = ResNet(conf, arch_name=conf.arch,
                              input_size=conf.image_size)
            model_ft.load_state_dict(
                torch.load("result/baseline_2020_03_21_13_01_08/model_0.pkl"))
        elif "densenet" in conf.arch:
            model_ft = DenseNet(conf, arch_name=conf.arch,
                                input_size=conf.image_size)
        elif "efficientnet" in conf.arch:
            model_ft = EfficientNet(conf, arch_name=conf.arch)

        criterion = [
            nn.CrossEntropyLoss(reduction="none"),
            nn.CrossEntropyLoss(reduction="none"),
            nn.CrossEntropyLoss(reduction="none")
        ]
        criterion = [c.to(device) for c in criterion]

        model_ft, val_preds = train_model(train_df, train_images, test_df,
                                          test_images, model_ft, criterion,
                                          log, device, result_dir, fold=i,
                                          num_epoch=conf.num_epoch)

        torch.save(model_ft.state_dict(), result_dir / f'model_{i}.pkl')
        np.save(result_dir / f'val_preds_{i}.npy', val_preds)
def load_subset(self, dataset, name):
    filename = os.path.join("..", "datasets", dataset, name, "features.csv")
    data = np.asarray(utils.load_csv(filename, skiprows=1))

    filename = os.path.join("..", "datasets", dataset, name,
                            "y_{}.txt".format(name))
    activities = np.asarray(utils.load_csv(filename)).ravel()

    filename = os.path.join("..", "datasets", dataset, name,
                            "subject_{}.txt".format(name))
    subjects = np.asarray(utils.load_csv(filename)).ravel()

    return data, activities, subjects
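# Illustrative usage of load_subset() above. The instance name `loader` and
# the dataset folder name "UCI_HAR" are hypothetical; the "train"/"test"
# split names match the y_{name}.txt / subject_{name}.txt convention used
# in the method.
x_train, y_train, subj_train = loader.load_subset("UCI_HAR", "train")
x_test, y_test, subj_test = loader.load_subset("UCI_HAR", "test")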
def train(train_file, test_file=None):
    data = utils.load_csv(train_file)
    feature_set = [(utils.feature_extract(i[0]), i[1]) for i in data]
    print 'Training'
    classifier = nltk.NaiveBayesClassifier.train(feature_set)
    utils.save_model(classifier)
    print 'Done Training'
    if test_file:
        data = utils.load_csv(test_file)
        test_feature_set = [(utils.feature_extract(i[0]), i[1]) for i in data]
        print 'Accuracy of model is {0}'.format(
            nltk.classify.accuracy(classifier, test_feature_set))
def load_artist_data(self, artist_name):
    artist_information = self._musicbrainz_searcher.get_musicbrainz_artist_info(
        artist_name)
    print('Load artist data:', artist_information)
    events_df = utils.load_csv(artist_information.name, 'events')
    setlists_df = utils.load_csv(artist_information.name, 'setlists')
    recordings_df = utils.load_csv(artist_information.name, 'recordings')
    if events_df is not None:
        events_df['eventdate'] = pd.to_datetime(events_df['eventdate'],
                                                format='%Y-%m-%d')
    if recordings_df is not None:
        recordings_df['date'] = pd.to_datetime(recordings_df['date'],
                                               format='%Y-%m-%d')
    return ArtistData(artist_information, events_df, setlists_df,
                      recordings_df)
def get_test_error(threshold):
    TP = 0
    TN = 0
    FP = 0
    FN = 0
    for i in range(test_data_size):
        data = utils.load_csv(test_set_dir + str(i) + ".csv")
        label = utils.get_label_from_model_with_threshold(
            data, model, threshold)
        if label == 1:
            if data[19][0] == 1:
                TP += 1
            elif data[19][0] == -1:
                FP += 1
            else:
                print('1 error')
        elif label == -1:
            if data[19][0] == 1:
                FN += 1
            elif data[19][0] == -1:
                TN += 1
            else:
                print('1 error')
        else:
            print(label)
    TPR = TP / (TP + FN)
    FPR = FP / (FP + TN)
    return TPR, FPR
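# A small sketch (not in the original source) showing how get_test_error()
# could be swept over a range of thresholds to trace out ROC points.
# The 0..1 threshold grid and the roc_points name are assumptions.
import numpy as np

def roc_points(thresholds=np.linspace(0.0, 1.0, 21)):
    points = []
    for t in thresholds:
        tpr, fpr = get_test_error(t)
        points.append((fpr, tpr))  # one (FPR, TPR) pair per threshold
    return points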
def get_cleaned_tweets(query_dict):
    """
    Get cleaned tweets
    :param query_dict:
        query_string: 'datacamp lang:en'
        time_since: '2019-03-01'
        time_until: '2019-05-01'
        max_tweets: 0 for unlimited
    :return: dataframe
    """
    file_name = _convert_query_dict_to_str_as_filename(query_dict)
    save_cleaned_file_name = paths.cleaned_tweets / 'cleaned_{}.csv'.format(
        file_name)
    if save_cleaned_file_name.is_file():
        print('Cleaned file {} already exists, reload'.format(
            save_cleaned_file_name))
        tweet_df = load_csv(save_cleaned_file_name)
    else:
        tweet_df = get_raw_tweets(query_dict)
        print('Cleaning tweets')
        cleaned_tweet_df = _clean_tweets_text(tweet_df)
        # print('Select only {USE_TWEETS_COLS} and save tweets to: {repr(save_cleaned_file_name)}'.format())
        cleaned_tweet_df[USE_TWEETS_COLS].to_csv(save_cleaned_file_name,
                                                 index=False)
    print('Done getting tweets.')
    return tweet_df
def get_raw_tweets(query_dict):
    """
    Get raw tweets
    :param query_dict:
        query_string: 'datacamp lang:en'
        time_since: '2019-03-01'
        time_until: '2019-05-01'
        max_tweets: 0 for unlimited
    :return: dataframe
    """
    file_name = _convert_query_dict_to_str_as_filename(query_dict)
    save_raw_file_name = paths.raw_tweets / 'raw_{}.csv'.format(file_name)
    if save_raw_file_name.is_file():
        print('Raw file {} already exists, reload'.format(
            repr(save_raw_file_name)))
        tweet_df = load_csv(save_raw_file_name)
    else:
        _validate_query(query_dict)
        print(f'Getting raw tweets with query:\n{query_dict!r}')
        tweet_criteria = _create_search_criteria(**query_dict)
        tweet_objects = _get_tweet_object(tweet_criteria)
        tweet_df = _convert_tweets_to_dataframe(tweet_objects)
        print(f'Saving raw tweets to: {repr(save_raw_file_name)}')
        tweet_df.to_csv(save_raw_file_name, index=False)
    print('Done getting raw tweets.')
    return tweet_df
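# Illustrative call of the two functions above; the concrete values simply
# mirror the example given in their docstrings and are not from live code.
query_dict = {
    'query_string': 'datacamp lang:en',
    'time_since': '2019-03-01',
    'time_until': '2019-05-01',
    'max_tweets': 0,  # 0 means unlimited, per the docstring
}
raw_df = get_raw_tweets(query_dict)
cleaned_df = get_cleaned_tweets(query_dict)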
def __init__(self, generator, discriminator, data_set_file, y_dim):
    self.generator = generator
    self.discriminator = discriminator
    self.data_set_file = data_set_file
    self.y_dim = y_dim  # useless condition
    indexs, latitude, longitude = ut.load_csv(self.data_set_file, 2)
    self.borders = range(B, len(latitude), B)
    self.Generator_input = 100
    self.Generator_output = len(self.borders)

    self.G_in = tf.placeholder(tf.float32, [None, self.Generator_input])
    self.real_partition = tf.placeholder(tf.float32,
                                         [None, self.Generator_output],
                                         name='real_in')
    self.condition = tf.placeholder(tf.float32,
                                    shape=[None, self.Generator_output])

    self.G_out = self.generator(concat(self.G_in, self.condition),
                                self.Generator_output)
    self.D_real, _ = self.discriminator(self.real_partition,
                                        self.Generator_output)
    self.D_fake, self.Q_fake = self.discriminator(self.G_out,
                                                  self.Generator_output,
                                                  reuse=True)

    self.D_loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(
            logits=self.D_real, labels=tf.ones_like(self.D_real)))
    self.G_loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(
            logits=self.D_fake, labels=tf.ones_like(self.D_fake)))
    self.Q_loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(
            logits=self.Q_fake, labels=self.real_partition))

    self.train_D = tf.train.AdamOptimizer(LR_D).minimize(
        self.D_loss,
        var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope=self.discriminator.name))
    self.train_G = tf.train.AdamOptimizer(LR_G).minimize(
        self.G_loss,
        var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope=self.generator.name))
    self.train_Q = tf.train.AdamOptimizer(LR_G).minimize(
        self.G_loss,
        var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope=self.generator.name) +
        tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                          scope=self.discriminator.name))
def __init__(self):
    loaded_csv = utils.load_csv('../../Data/Train/NHID2013.csv')
    # print("loaded_csv", loaded_csv)
    # print("loaded_csv", loaded_csv.shape)  # (3081208, 6)

    min_max_scaler = preprocessing.MinMaxScaler()
    loaded_csv_minmax = pd.DataFrame(
        min_max_scaler.fit_transform(loaded_csv.iloc[:, :4]))
    # print(loaded_csv_minmax)
    # print("loaded_csv_minmax", loaded_csv_minmax.shape)
    # loaded_csv_minmax (234426, 6)

    loaded_csv_minmax = pd.concat(
        [loaded_csv_minmax, loaded_csv.iloc[:, 4:]], axis=1)
    # print("loaded_csv_minmax", loaded_csv_minmax)

    self.train_X = np.array(loaded_csv_minmax.iloc[:3000000, :4])
    self.train_y = np.array(loaded_csv_minmax.iloc[:3000000, 4:])
    # self.train_X = np.array(loaded_csv_minmax.iloc[:3000, :4])
    # self.train_y = np.array(loaded_csv_minmax.iloc[:3000, 4:])
    # print("train_X", train_X.shape)
    # print("train_y", train_y.shape)

    self.number_of_data = self.train_X.shape[0]
def run(self): global total_removed if self.path is None: return print("%s starts..." % self.name) content = load_csv(self.path, select_list=[3,4,5,6]) # str2float content = list(map(map_str2float, content)) # gcj2wgs content = list(map(map_list, content)) n = remove_baddata(content) lock.acquire() total_removed += n lock.release() content = np.array(content) content[:,[0,2]] -= 103.5 content[:,[0,2]] /= 0.1 content[:,[1,3]] -= 30.3 content[:,[1,3]] /= 0.05 content = list(map(map_float2int, content)) tem_dis = np.zeros(self.distribution.shape) for row in content: tem_dis[row[0], row[1]] += 1 tem_dis[row[2], row[3]] += 1 # update lock.acquire() self.distribution += tem_dis lock.release() print("%s finished! There are %d removed." %(self.name, n))
def get_angles_data(input_folder, output_folder, files_keep,
                    type_data="angles", align=True):
    files_keep_clean = [file_name.split(".")[0] for file_name in files_keep]
    files_angles = get_files(join_path(input_folder, type_data))
    if align:
        files_events = get_files(join_path(input_folder, "events"))
    os.makedirs(join_path(output_folder, type_data), exist_ok=True)
    for file_ in files_angles:
        if file_.split(".")[0] in files_keep_clean:
            data = np.load(join_path(input_folder, type_data, file_),
                           allow_pickle=True)
            if len(data.shape) == 3:
                if np.count_nonzero(np.isnan(data)):
                    continue
            else:
                continue
            np.save(join_path(output_folder, type_data, file_), data)
            if align:
                events = load_csv(
                    join_path(input_folder, "events",
                              "{}.csv".format(file_.split(".")[0])),
                    dtype=str,
                )
                align_and_save_data(data, events, output_folder, file_,
                                    type_data=type_data)
def get_feats_from_csv_in_partitions(): """ Extract the original features that are distributed in the dataset. Features are splitted according with the config.yaml file. """ conf = utils.get_config() rows = [ row for row in utils.load_csv() if utils.check_filter(row, conf['filters']) ] train_rows, valid_rows, test_rows = utils.split_dataset( rows, conf['valid_percent'], conf['test_percent'], rng=conf['rng_seed']) X_train, y_train, X_valid, y_valid, X_test, y_test = [], [], [], [], [], [] prefixes = ['t_', 'i_', 's_'] # Feature names' prefixes datasets = [(X_train, y_train, train_rows), (X_test, y_test, test_rows), (X_valid, y_valid, valid_rows)] out = [] for X, y, rows in datasets: for row in rows: X.append([ float(v) for k, v in row.iteritems() if len(filter(k.startswith, prefixes)) > 0 ]) y.append(int(row['classification'] == 'Malign')) out.extend((np.asarray(X), np.asarray(y))) return out
def get_feats_in_partitions(): """ Extracts features from all dataset and split them in train validation and test sets """ conf = utils.get_config() paths = utils.get_paths() rows = utils.load_csv() filters = conf['filters'] region_size = conf['region_size'] region_stride = conf['region_stride'] filtered_rows = [ row for row in rows if utils.check_filter(row, conf['filters'])] train_rows, valid_rows, test_rows = utils.split_dataset( filtered_rows, conf['valid_percent'], conf['test_percent'], rng=conf['rng_seed']) conv = get_fprop_fn(False) print 'Getting features from train...' X_train = get_feats_from_rows( train_rows, conv, conf['stride']) print 'Getting features from valid...' X_valid = get_feats_from_rows( valid_rows, conv, conf['stride']) print 'Getting features from test...' X_test = get_feats_from_rows( test_rows, conv, conf['stride']) y_train = [row['classification'] == 'Malign' for row in train_rows] y_valid = [row['classification'] == 'Malign' for row in valid_rows] y_test = [row['classification'] == 'Malign' for row in test_rows] return X_train, y_train, X_valid, y_valid, X_test, y_test
def main():
    args = parser.parse_args()
    header, dataset = utils.load_csv(args.input)
    if len(dataset) == 0:
        parser.error("Invalid input: file does not exist or is empty.")

    normalized = standardize(dataset)
    dendrogram_info = clusterize(normalized, args.linkage)
    fig = plot(dendrogram_info)
    fig.savefig(args.output + "_full.png", format="png")
    plt.show()

    weights = [args.average_weight, args.sd_weight]
    trees = cut(dendrogram_info, weights, args.class_range)
    fig = plot(trees)
    fig.savefig(args.output + ".png", format="png")
    plt.show()
    print("%d clusters were generated." % len(trees))

    classified = [header + ["Classification"]]
    clusters = get_clusters(trees)
    for i in range(len(dataset)):
        classified.append(dataset[i] + [clusters[i]])
    utils.save_csv(args.output + ".csv", classified)
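# standardize() is not shown in this example; a minimal z-score sketch of what
# it plausibly does (an assumption, not the project's actual implementation):
import numpy as np

def standardize(dataset):
    # Convert the loaded rows to floats and scale each column to zero mean
    # and unit variance; assumes no constant (zero-variance) columns.
    data = np.asarray(dataset, dtype=float)
    return (data - data.mean(axis=0)) / data.std(axis=0)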
def sort_orders(path_dir="data/process/order_201611.csv"):
    content = load_csv(path_dir)
    content = list(map(map_str2int, content))
    for i in range(len(content)):
        content[i][1] = max(content[i][1] - content[i][0], 1)
    content.sort(key=lambda x: x[0])
    write_csv(content, path_dir)
def __init__(self, feature_type, data_type):
    self.feature_type = feature_type
    self.data_type = data_type
    data_dir = f"data/features/{feature_type}/"
    ids_dir = f"data/features/ids/"
    tracks = utils.load_csv("tracks")["track"]
    self.genres = np.unique(tracks["genre_top"].to_numpy()).tolist()
    if data_type in ["test", "validate"]:
        filename = f"{data_type}_{feature_type}.npy"
        self.npy = np.load(f"{data_dir}{filename}", mmap_mode="r")
        self.ids = np.load(f"{ids_dir}{data_type}_ids.npy")
        self.genre_data = tracks["genre_top"].loc[self.ids].tolist()
    elif data_type == "train":
        self.npys = []
        self.ids = []
        self.genre_datas = []
        for i in range(8):
            filename = f"{data_type}_{feature_type}_{i}.npy"
            ids = np.load(f"{ids_dir}{data_type}_ids_{i}.npy")
            npy = np.load(f"{data_dir}{filename}", mmap_mode="r")
            genre_data = tracks["genre_top"].loc[ids].tolist()
            self.npys.append(npy)
            self.ids.append(ids)
            self.genre_datas.append(genre_data)
    del tracks
def run(self): global total_removed if self.path is None: return print("%s starts..." % self.name) content = load_csv(self.path, select_list=[3,4,5,6]) content = list(map(map_str2float, content)) n = remove_baddata(content) lock.acquire() total_removed += n lock.release() content = np.array(list(map(map_list, content))) min_x = content[:,[0,2]].min() max_x = content[:,[0,2]].max() min_y = content[:,[1,3]].min() max_y = content[:,[1,3]].max() # update lock.acquire() self.min_x_list.append(min_x) self.max_x_list.append(max_x) self.min_y_list.append(min_y) self.max_y_list.append(max_y) lock.release() print("%s finished! There are %d removed." %(self.name, n))
def find_pos_range(path_dir="data/extracted"):
    min_x_list = []
    max_x_list = []
    min_y_list = []
    max_y_list = []
    n_total = 0  # total removed
    for _, _, files in os.walk(path_dir):
        for file_name in files:
            if file_name.startswith("order"):
                temp_path = os.path.join(path_dir, file_name)
                content = load_csv(temp_path, select_list=[3, 4, 5, 6])
                content = list(map(map_str2float, content))
                n_total += remove_baddata(content)
                content = np.array(list(map(map_list, content)))
                min_x = content[:, [0, 2]].min()
                max_x = content[:, [0, 2]].max()
                min_y = content[:, [1, 3]].min()
                max_y = content[:, [1, 3]].max()
                min_x_list.append(min_x)
                max_x_list.append(max_x)
                min_y_list.append(min_y)
                max_y_list.append(max_y)
    print(min(min_x_list))  # 103.0002196712431
    print(max(max_x_list))  # 120.35693932767293
    print(min(min_y_list))  # 22.86432541244561
    print(max(max_y_list))  # 40.144055627798586
    print(n_total)
def main():
    parser = fix_csv_parser.get_parser()
    args = parser.parse_args()

    extension = "tsv" if args.tsv else "csv" if args.csv else None
    delimiter = "\t" if args.tsv else "," if args.csv else None
    quotechar = '"'

    for csv_path in args.csv_paths:
        csv_path = Path(csv_path)
        destination_folder = (csv_path.parent
                              if args.destination_folder is None
                              else args.destination_folder)
        destination_folder = Path(destination_folder)
        os.makedirs(destination_folder, exist_ok=True)
        new_name = f"{utils.get_filename_without_extension(csv_path)}.{extension}"
        destination_path = destination_folder / new_name

        rows = utils.load_csv(csv_path=csv_path)
        utils.save_rows(
            rows=rows,
            destination_path=destination_path,
            delimiter=delimiter,
            quotechar=quotechar,
        )
        utils.save_rows(
            rows=rows,
            destination_path=str(destination_path)[:-4] + "-fixed.csv",
            delimiter=",",
            quotechar=quotechar,
        )
def get_feats_in_partitions(): """ Extracts features from all dataset and split them in train validation and test sets """ conf = utils.get_config() paths = utils.get_paths() rows = utils.load_csv() filters = conf['filters'] region_size = conf['region_size'] region_stride = conf['region_stride'] filtered_rows = [ row for row in rows if utils.check_filter(row, conf['filters']) ] train_rows, valid_rows, test_rows = utils.split_dataset( filtered_rows, conf['valid_percent'], conf['test_percent'], rng=conf['rng_seed']) conv = get_fprop_fn(False) print 'Getting features from train...' X_train = get_feats_from_rows(train_rows, conv, conf['stride']) print 'Getting features from valid...' X_valid = get_feats_from_rows(valid_rows, conv, conf['stride']) print 'Getting features from test...' X_test = get_feats_from_rows(test_rows, conv, conf['stride']) y_train = [row['classification'] == 'Malign' for row in train_rows] y_valid = [row['classification'] == 'Malign' for row in valid_rows] y_test = [row['classification'] == 'Malign' for row in test_rows] return X_train, y_train, X_valid, y_valid, X_test, y_test
def main(): """Goes through all the correspondance files, and foreach of our sitc codes, fetches the descriptions used by the harmonized systems. These extra descriptions will later be used to better match based on text similarity""" sitc_codes = load_csv(SITC2_FILE_PATH) # load all harmonized system categories hs_codes = {hs: {} for hs in HARMONIZED_SYSTEM_NAMES} for hs_system in HARMONIZED_SYSTEM_NAMES: hs = load_csv(HS_FILE_PATH_RAW.format(hs_system=hs_system)) hs_codes[hs_system] = hs # load all correspondence tables hs_correspondence_tables = {hs: {} for hs in HARMONIZED_SYSTEM_NAMES} for hs_system in HARMONIZED_SYSTEM_NAMES: hs = load_correspondence_tables( CORRESPONDENCE_FILE_PATH_PREPROCESSED.format(hs_system=hs_system), system=hs_system) hs_correspondence_tables[hs_system] = hs sitc_codes_enriched = {code: set() for code in sitc_codes.keys()} # foreach sitc_code, find its correspondent from hs_codes and store them as set for sitc_code in sitc_codes.keys(): # go through all mappings and fetch its description for hs_system, mappings in hs_correspondence_tables.items(): mapping = mappings.get(sitc_code) if mapping: # might need to change to get sitc_codes_enriched[sitc_code].add( hs_codes[hs_system][mapping]) print( f'in total {len(sitc_codes_enriched)} and only {len([c for c, v in sitc_codes_enriched.items() if v])} ' f'extended') # store the mapped stuff with open(ENRICHED_SITC_CODES_FILE_PATH, 'w') as csvfile: writer = csv.writer(csvfile) writer.writerow(['ID', 'Mapping']) for sitc_code, desc in sitc_codes_enriched.items(): if desc: writer.writerow([sitc_code, '~'.join(desc)]) print(f'Extended mapping stored under {ENRICHED_SITC_CODES_FILE_PATH}')
def main():
    csv_path = 'data/all_test_clean.csv'
    tweets, targets, labels = load_csv(csv_path)
    print('--- LOADED CSV ---')
    model = load_bert()
    print('--- LOADED MODEL ---')
    preds = predict(model, tweets, targets)
    save_npy(preds, 'ada_bert', 'preds/')
    print('--- SAVED PREDS ---')
    print_metrics(preds, labels, 'ada_bert')
def total_orders(path_dir="data/extracted"):
    n = 0
    for _, _, files in os.walk(path_dir):
        for file_name in files:
            if not file_name.startswith("order"):
                continue
            temp_path = os.path.join(path_dir, file_name)
            content = load_csv(temp_path, select_list=[3, 4, 5, 6])
            n += len(content)
    print(n)  # 7065937
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-t', '--test',
                        help="A test directory containing map.pgm, measure.csv, "
                             "control.csv, and ground.csv files",
                        required=True)
    parser.add_argument('-s', '--states',
                        help='The file containing the starting states to use')
    parser.add_argument('-v', '--visualizer', action='store_const', const=True,
                        help='Add this flag to turn on the visualizer',
                        default=False)
    parser.add_argument('-n', '--numstart', type=int, default=200)
    args = parser.parse_args()

    lmap = utils.load_map('tests/' + args.test + '/map.pgm')
    # a Tx2 array of T (delta phi, velocity)'s
    controls = utils.load_csv('tests/' + args.test + '/control.csv')
    # a TxMx2 array of T sets of M measurements containing a degree and a
    # measured distance at that degree
    measurements = utils.load_measurements('tests/' + args.test + '/measure.csv')
    true_start = utils.load_csv('tests/' + args.test + '/ground.csv')
    if args.states:
        # an Nx3 array of N (x,y,phi)'s
        start_posns = utils.load_csv(args.states)
    else:
        start_posns = generate_init_states(lmap, args.numstart)
    print("Using particle_filter function...")
    particle_filter(start_posns, controls, measurements, lmap, true_start,
                    args.visualizer)
def read_csv(input_file):
    """Reads a tab separated value file."""
    df = load_csv(input_file, header=0).fillna('|')
    print(df.head())
    jobcontent = df['content'].tolist()
    print("__________________________________________")
    jlabel = df.loc[:, hp.label_vocabulary].values
    print('Read csv finished!(1)')
    print(jlabel[:5])
    return shuffle_one([[jlabel[i], jobcontent[i]]
                        for i in range(len(jlabel))
                        if type(jobcontent[i]) == str])
def _read_csv(cls, input_file):
    """Reads a tab separated value file."""
    df = load_csv(input_file, header=0).fillna('|')
    jobcontent = df['content'].tolist()
    jlabel = df.loc[:, hp.label_vocabulary].values
    lines = [[jlabel[i], jobcontent[i]] for i in range(len(jlabel))
             if type(jobcontent[i]) == str]
    lines2 = shuffle_one(lines)
    print('Read csv finished!(1)')
    print('Head data:', lines2[0:5])
    print('Length of data:', len(lines2))
    return lines2
def main(): exp_name = f'baseline_{now()}' device, log, result_dir = setup(exp_name, conf) train_df = load_csv(conf.train_csv) if conf.npy: train_images = np.load(conf.train_images) else: train_images = pd.read_parquet(conf.train_images) train_df["gr"] = 0 train_df["cd"] = 0 train_df["vd"] = 0 train_df["image_mean"] = 0 models = [f"se_resnext50_f{i}.pkl" for i in range(5)] preds = np.zeros((len(train_df), conf.gr_size + conf.vd_size + conf.cd_size)) image_stats = np.zeros((len(train_df), 2)) log.info('done') for i in range(5): model = ResNet(conf, arch_name=conf.arch, input_size=conf.image_size) model.load_state_dict(torch.load(models[i])) model.to(device) ds = val_split(train_df, train_images, fold=i) _, val_ds, _, val_images = ds['train'], ds['val'], ds['train_images'], ds['val_images'] test_preds = predict(model, val_ds, val_images, valid_transform, device) print(test_preds.shape) te_ind = ds['te_ind'] preds[te_ind] += test_preds image_stats[te_ind, 0] = val_images.mean((1, 2)) image_stats[te_ind, 0] = val_images.std((1, 2)) preds = np.concatenate([preds, image_stats], axis=1) for t in ["grapheme_root", "vowel_diacritic", "consonant_diacritic"]: rf = RandomForestClassifier(n_jobs=16) # train = xgb.DMatrix(preds, label=train_df[t]) # params = {"max_depth": 4, "nthread": 16, "objective": "multi:softmax", # "eval_metric": ["merror", "mlogloss"], "num_class": conf.gr_size} # xgb.cv(params, train, num_boost_round=1000, nfold=5, seed=conf.seed, # early_stopping_rounds=40, verbose_eval=10) rf.fit(preds, train_df[t]) with open(f"{t}_rf2.pkl", "wb") as f: joblib.dump(rf, f)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input_file', help="Input csv file with data.")
    args = parser.parse_args()

    dataset = utils.load_csv(args.input_file)
    train_set, test_set = corpus.split_dataset(dataset, 0.67)
    separated = corpus.separate_by_class(train_set)
    summaries = corpus.summarize_by_class(train_set)
    predictions = predict_set(summaries, test_set)
    accuracy = utils.get_accuracy(test_set, predictions)
    print('Accuracy: {0}%'.format(accuracy))
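# utils.get_accuracy() is not shown in this example; a minimal sketch of what
# it plausibly computes, under the assumption that each test row ends with its
# true class label (not the project's actual implementation):
def get_accuracy(test_set, predictions):
    correct = sum(1 for row, pred in zip(test_set, predictions)
                  if row[-1] == pred)
    return correct / float(len(test_set)) * 100.0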
def main():
    db = "{}/data/db".format(environ["WD"])
    if not exists(db):
        con = connect(db)
        data = pipe(
            load_csv(),
            rename_columns,
            clean_date,
        )
        data.to_sql(name="data", con=con)
    else:
        print("data already compiled to {}".format(db))
def main():
    import sys
    from utils import load_csv, get_stencil_num

    raw_data = load_csv(sys.argv[1])

    k_l = set()
    for k in raw_data:
        k_l.add((get_stencil_num(k), k['Global NX']))
    k_l = list(k_l)

    bsz_l = set()
    for k in raw_data:
        if k['Multi-wavefront updates'] == '0':
            continue
        bsz_l.add(k['Multi-wavefront updates'])
    bsz_l = sorted(list(bsz_l))

    for k, N in k_l:
        for bsz in bsz_l:
            gen_res(raw_data, int(k), int(bsz), int(N))
def main():
    import sys
    from ics_utils import get_stencil_num
    from utils import load_csv

    raw_data = load_csv(sys.argv[1])

    k_l = set()
    for k in raw_data:
        k_l.add(get_stencil_num(k))
    k_l = list(k_l)

    n_l = set()
    for k in raw_data:
        n_l.add(k['Global NX'])
    n_l = list(n_l)

    for k in k_l:
        for N in n_l:
            gen_res(raw_data, int(k), int(N))
def get_feats_from_csv_in_partitions(): """ Extract the original features that are distributed in the dataset. Features are splitted according with the config.yaml file. """ conf = utils.get_config() rows = [row for row in utils.load_csv() if utils.check_filter(row, conf['filters'])] train_rows, valid_rows, test_rows = utils.split_dataset( rows, conf['valid_percent'], conf['test_percent'], rng=conf['rng_seed']) X_train, y_train, X_valid, y_valid, X_test, y_test = [], [], [], [], [], [] prefixes = ['t_', 'i_', 's_'] # Feature names' prefixes datasets = [(X_train, y_train, train_rows), (X_test, y_test, test_rows), (X_valid, y_valid, valid_rows)] out = [] for X, y, rows in datasets: for row in rows: X.append( [float(v) for k, v in row.iteritems() if len(filter(k.startswith, prefixes)) > 0]) y.append(int(row['classification'] == 'Malign')) out.extend((np.asarray(X), np.asarray(y))) return out
def main():
    import sys
    from utils import select_fields, load_csv

    raw_data = load_csv(sys.argv[1])

    stencil = '7_pt_const'
    rows = [
        {'Thread group size': '0', 'Stencil Kernel coefficients': 'constant',
         'Stencil Kernel semi-bandwidth': '1', 'OpenMP Threads': '6'},
        {'Thread group size': '1', 'Stencil Kernel coefficients': 'constant',
         'Stencil Kernel semi-bandwidth': '1', 'OpenMP Threads': '10'},
        {'Thread group size': '2', 'Stencil Kernel coefficients': 'constant',
         'Stencil Kernel semi-bandwidth': '1', 'OpenMP Threads': '10'},
        {'Thread group size': '5', 'Stencil Kernel coefficients': 'constant',
         'Stencil Kernel semi-bandwidth': '1', 'OpenMP Threads': '10'},
        {'Thread group size': '10', 'Stencil Kernel coefficients': 'constant',
         'Stencil Kernel semi-bandwidth': '1', 'OpenMP Threads': '10'},
    ]
    create_table(raw_data, rows, stencil)

    stencil = '7_pt_var'
    rows = [
        {'Thread group size': '0', 'Stencil Kernel coefficients': 'variable no-symmetry',
         'Stencil Kernel semi-bandwidth': '1', 'OpenMP Threads': '6'},
        {'Thread group size': '1', 'Stencil Kernel coefficients': 'variable no-symmetry',
         'Stencil Kernel semi-bandwidth': '1', 'OpenMP Threads': '8'},
        {'Thread group size': '2', 'Stencil Kernel coefficients': 'variable no-symmetry',
         'Stencil Kernel semi-bandwidth': '1', 'OpenMP Threads': '10'},
        {'Thread group size': '5', 'Stencil Kernel coefficients': 'variable no-symmetry',
         'Stencil Kernel semi-bandwidth': '1', 'OpenMP Threads': '10'},
        {'Thread group size': '10', 'Stencil Kernel coefficients': 'variable no-symmetry',
         'Stencil Kernel semi-bandwidth': '1', 'OpenMP Threads': '10'},
    ]
    create_table(raw_data, rows, stencil)

    stencil = '25_pt_var'
    rows = [
        {'Thread group size': '0', 'Stencil Kernel coefficients': 'variable axis-symmetric',
         'Stencil Kernel semi-bandwidth': '4', 'OpenMP Threads': '8'},
        {'Thread group size': '1', 'Stencil Kernel coefficients': 'variable axis-symmetric',
         'Stencil Kernel semi-bandwidth': '4', 'OpenMP Threads': '7'},
        {'Thread group size': '2', 'Stencil Kernel coefficients': 'variable axis-symmetric',
         'Stencil Kernel semi-bandwidth': '4', 'OpenMP Threads': '8'},
        {'Thread group size': '5', 'Stencil Kernel coefficients': 'variable axis-symmetric',
         'Stencil Kernel semi-bandwidth': '4', 'OpenMP Threads': '10'},
        {'Thread group size': '10', 'Stencil Kernel coefficients': 'variable axis-symmetric',
         'Stencil Kernel semi-bandwidth': '4', 'OpenMP Threads': '10'},
    ]
    create_table(raw_data, rows, stencil)
        X[i] = features[np.nonzero(int(row['segmentation_id']) == segm_ids)][0]
        y[i] = utils.is_positive(row)
    return X, y


rng = [2014, 12, 5]
rng = make_np_rng(None, rng, which_method='uniform')
scale_feats = True
n_runs = 20
C_range = 10.0 ** np.arange(-8, 8)
train_scores = np.zeros((n_runs, len(C_range)))
valid_scores = np.zeros((n_runs, len(C_range)))
fit_threshold = True

conf_file = sys.argv[1] if len(sys.argv) > 1 else None
conf = utils.get_config(conf_file)

features = np.empty([len(utils.load_csv()), 0])
# f_list = ['hcfeats', 'imnet', 'cnn']
f_list = ['cnn']
if 'imnet' in f_list:
    rows = utils.load_csv()
    feats, y = fe_extraction.get_feats_from_imagenet(rows)
    features = np.hstack((features, feats))
    segm_ids = np.asarray([int(row['segmentation_id']) for row in rows])
if 'hcfeats' in f_list:
    rows = utils.load_csv(conf['csv_features_file'])
    feats, y = fe_extraction.get_feats_from_csv(
        rows, prefixes=['s_', 't_', 'i_'])
    feats = np.asarray(feats)
    features = np.hstack((features, feats))
    segm_ids = np.asarray([int(row['segmentation_id']) for row in rows])