def create_dataset(params: argparse.Namespace):
    """Window the train/test HDF5 volumes into tiles and save them.

    Reads train.h5/test.h5 from ``params.dataset_path``, cuts each into
    overlapping windows, and writes the tiled tensors to
    ``params.output_path``.
    """
    start = time.time()

    # Load the raw volumes together with their spatial shapes.
    train_set, train_shape = read_hdf(os.path.join(params.dataset_path, 'train.h5'))
    test_set, test_shape = read_hdf(os.path.join(params.dataset_path, 'test.h5'))

    input_shape = params.tile_shape + [2, ]
    num_classes = NUM_CLASSES

    # Windowing config: fixed seed 0, tile size, stride = 1/8 of the tile.
    tile_size = input_shape[0]
    tile_stride = int(input_shape[0] / 8)
    break_tiles_info = (0, tile_size, tile_stride)

    train_tiles_input, train_tiles_label = view_as_window(
        train_set, break_tiles_info, train_shape, num_classes)
    test_tiles_input, test_tiles_label = view_as_window(
        test_set, break_tiles_info, test_shape, num_classes)

    utils.makedir(params.output_path)
    save_hdf(os.path.join(params.output_path, 'train.h5'),
             train_tiles_input, train_tiles_label)
    save_hdf(os.path.join(params.output_path, 'test.h5'),
             test_tiles_input, test_tiles_label)

    print(f"Total time for dataset generation: {time.time() - start}")
def create_overview(files, start, end, num_files_per_group=8, type_output='B', to_mp3=False, artist='Glossika', album='Glossika Training', prefix=''):
    """Build an overview playlist from ``files`` and render it to one track.

    The list is cut into groups of ``num_files_per_group``; each group is
    appended to the result twice in a row. ``start``/``end`` are the caller's
    original track numbers and are only used for the output file name --
    indexing into ``files`` is rebased to 0 below.
    """
    # 0 means "pick a sensible group size automatically".
    if num_files_per_group == 0:
        num_files_per_group = get_num_files(len(files))
    result = []
    # Keep the caller's range for naming, then rebase to 0-based indices.
    old_start = start
    old_end = end
    start = 0
    end = old_end - old_start + 1
    for i in range(math.ceil((end - start + 1) / num_files_per_group)):
        sub_list = files[start:min(start + num_files_per_group, end)][:]
        # Each group is appended twice back to back.
        result = result + sub_list + sub_list
        start = start + num_files_per_group
    # '1' marks type 'B' output, '2' anything else.
    type_num = '1' if type_output == 'B' else '2'
    dir_name = OUTPUT_ALL + '(wav)/' + sub_directory()
    name = _get_name(prefix, type_num, dir_name, old_start, old_end)
    makedir(dir_name)
    make_track(result, name)
    convert_mp3(to_mp3, name, dir_name.replace('wav', 'mp3'), artist, album)
    print('Shuffle Files: Done')
def make_pie(sim, select, phi):
    """Draw a small pie chart of accumulated outputs for one selector/phi.

    Relies on module-level ``names``, ``lights``, and ``out`` (presumably
    selector names, lighten factors, and the output key -- defined elsewhere
    in this file; TODO confirm).
    """
    plt.figure(figsize=(2, 2))
    # Combine each named selector with the requested one.
    selectors = [sim.model.select[name].union(sim.model.select[select]) for name in names]
    # Accumulate each output at the final time point, then normalize to fractions.
    SIR = np.array([modelutils.taccum(sim.outputs[out], **selector).islice(t=sim.t[-1]) for selector in selectors])
    SIR = SIR / SIR.sum()
    colors = [selector.color.lighten(light) for selector, light in zip(selectors, lights)]
    labels = [sim.model.select[name].label for name in names]
    # Swap the last two wedges -- presumably a display-order choice; see the
    # commented alternative below.
    reorder = lambda x: [x[0], x[2], x[1]]
    # reorder = lambda x: [x[0],x[1]]
    plt.pie(reorder(SIR),
            colors=reorder(colors),
            startangle=90,
            counterclock=True)
    plt.tight_layout(pad=-1.8)
    figdir = os.path.join(config.path['figs'], 'flows', 'phi={}'.format(phi))
    utils.makedir(figdir)
    if config.save:
        # NOTE(review): the format string has two placeholders but three args;
        # ``phi`` is silently dropped (it is already encoded in ``figdir``) --
        # confirm this is intended.
        plt.savefig(os.path.join(figdir, '{}-{}.pdf'.format('flow', select, phi)), transparent=True)
    else:
        plt.show()
    plt.close()
    make_legend(labels, colors)
def train():
    """Train an Xception classifier on XRay14 with focal loss and LR schedule."""
    # train_val_df, test_df, xray14_labels = load_from_text(cfg.data_root)
    trainval_df, test_df, xray14_labels = load_from_npy()
    fit_df, holdout_df = split_train_val(trainval_df, ratio=0.25)

    banner = '*' * 40
    print(banner, 'tain data', banner)
    describe_data(fit_df, xray14_labels)
    print(banner, 'val data', banner)
    describe_data(holdout_df, xray14_labels)

    train_gen, val_gen = creat_tain_val_generator(
        fit_df, holdout_df, cfg.input_shape[:-1], batch_size=cfg.batch_size)

    model = Xception(cfg.input_shape, include_top=True,
                     n_class=len(xray14_labels), pretrain_weights='imagenet')
    model.compile(optimizer=Adam(), loss=[focal_loss()],
                  metrics=['binary_accuracy', 'mae'])

    run_dir = os.path.join(cfg.log_dir, 'xray14_focal')
    makedir(run_dir)
    ckpt_path = os.path.join(run_dir, cfg.weights_name)
    callbacks = [
        ModelCheckpoint(ckpt_path, monitor='val_loss', verbose=1,
                        save_best_only=True, mode='min',
                        save_weights_only=True),
        LearningRateScheduler(lr_schedule),
    ]

    # 5x passes over train and 2x over validation per epoch.
    steps_tr = get_number_of_steps(len(fit_df), cfg.batch_size) * 5
    steps_va = get_number_of_steps(len(holdout_df), cfg.batch_size) * 2
    model.fit_generator(train_gen, epochs=cfg.epochs, steps_per_epoch=steps_tr,
                        callbacks=callbacks, validation_data=val_gen,
                        workers=cfg.n_works, max_queue_size=cfg.n_queue,
                        use_multiprocessing=True, validation_steps=steps_va,
                        initial_epoch=0)
def __init__(self, data_dir, pickle_file, class_id_to_name_map, save_path_viz, train=True, visualize=True):
    """Dataset over images in ``data_dir`` with bboxes from ``pickle_file``.

    Args:
        data_dir: directory whose entries (globbed with '*') are the images.
        pickle_file: path to a pickled bbox-info mapping.
        class_id_to_name_map: class-id -> readable-name mapping.
        save_path_viz: visualization output dir (only used when ``visualize``
            is True; otherwise ``self.save_path_viz`` stays None).
        train: selects the train vs. test transform pipelines.
        visualize: when True, create ``save_path_viz`` and remember it.
    """
    super(VinBigDataset, self).__init__()
    self.data_dir = data_dir
    self.pickle_file = pickle_file
    # Fix: use a context manager so the pickle file handle is closed promptly
    # (previously pickle.load(open(...)) leaked the handle until GC).
    with open(pickle_file, 'rb') as f:
        self.bboxes_info = pickle.load(f)
    self.save_path_viz = None
    self.visualize = visualize
    self.train = train
    self.class_id_to_name_map = class_id_to_name_map
    if self.visualize:
        self.save_path_viz = save_path_viz
        makedir(self.save_path_viz)
    if self.train:
        self.transforms = train_transforms
        self.transforms_only_image = train_transforms_only_image
    else:
        self.transforms = test_transforms
        self.transforms_only_image = test_transforms_only_image
    self.image_paths = sorted(glob.glob(os.path.join(data_dir, '*')))
def projection(embed, save_dir, embed_name):
    """Score gender bias of ``embed`` via projection onto the he-she direction.

    Reads the stereotype word list from data/stereotype_list.csv, computes the
    cosine of each word vector with embed['he'] - embed['she'], saves a
    scatter plot annotated with the 5 most extreme words on each side, writes
    the average absolute projection to CSV, and returns it.

    Fixes: removed the no-op ``gender_direction = gender_direction``
    self-assignment and replaced the manual append loop with a comprehension.
    """
    data = pd.read_csv("data/stereotype_list.csv")
    X = data["male"].values.tolist() + data["female"].values.tolist()
    X_words, X_emb = get_word_vectors(embed, X)
    gender_direction = embed["he"] - embed["she"]
    # Similarity of every word vector with the gender direction.
    project = np.array([cosine(x, gender_direction) for x in X_emb])
    avg_project = np.abs(project).mean()
    orders = np.argsort(project)

    plt.figure()
    plt.scatter(project, range(len(project)), s=10)
    plt.yticks([])
    plt.xlim([-0.5, 0.5])
    plt.xlabel("Similarity")
    # Label the 5 most extreme words on each side of the axis.
    for i in range(5):
        plt.text(project[orders[i]], orders[i] + 0.2, X_words[orders[i]])
        plt.text(project[orders[-(i + 1)]], orders[-(i + 1)] + 0.2,
                 X_words[orders[-(i + 1)]])
    plt.savefig(
        makedir([save_dir, "projection"], "{}_plot.png".format(embed_name)))

    score = pd.DataFrame([[avg_project]], columns=["score"])
    score.to_csv(makedir([save_dir, "projection"],
                         "{}_acc.csv".format(embed_name)),
                 index=False)
    return avg_project
def get_path(self, model_path, probe_data):
    """Derive the results directory from ``model_path``, store it, create it.

    Drops the last path segment (presumably the model file name), rewrites
    the second segment to 'results', and appends ``probe_data`` plus a
    trailing slash.
    """
    segments = model_path.split('/')[:-1]
    segments[1] = 'results'
    segments.extend([probe_data, ''])
    self.path = '/'.join(segments)
    makedir(self.path)
def base_dpn68_mixup():
    """Train DPN68 with BCE loss plus a mixup loader, then write a submission.

    Appends a timestamped run header to <cfg.log_dir>/<task_name>_log.txt.
    """
    task_name = "base_dpn68_mixup"
    makedir(os.path.join(cfg.log_dir, task_name))
    print("Task Name is ", task_name)
    log = Logger(os.path.join(cfg.log_dir, task_name + '_log.txt'), mode="a")
    log("\n\n" + '-' * 51 + "[START %s]" % datetime.now().strftime('%Y-%m-%d %H:%M:%S') + "-" * 51 + "\n\n")
    print(cfg, file=log)
    # mix_up=True makes the factory return the extra mixup loader.
    train_loader, val_loader, test_loader, mix_loader = get_dataloader(
        mix_up=True)
    model = get_model()['dpn68'].cuda()
    criterion = get_loss()['bce'].cuda()
    optimizer = optim.SGD(model.parameters(), lr=cfg.lr, momentum=0.9,
                          weight_decay=1e-6)
    scheduler = lr_scheduler.StepLR(optimizer, step_size=8, gamma=0.1)
    # NOTE(review): keyword 'mix_loder' looks misspelled ('mix_loader') --
    # verify it matches train()'s actual parameter name before renaming.
    model = train(task_name, model, optimizer, criterion, scheduler,
                  train_loader, val_loader, mix_loder=mix_loader, log=log)
    submission_best_loss(task_name, model, test_loader, log=log)
def task4():
    """Train InceptionResNetV2 with BCE loss on a multi-step LR schedule."""
    task_name = "base_inception_restnet"
    print("Task Name is ", task_name)
    makedir(os.path.join(cfg.log_dir, task_name))
    log = Logger(os.path.join(cfg.log_dir, task_name + '_log.txt'), mode="a")
    log("\n\n" + '-' * 51 + "[START %s]" % datetime.now().strftime('%Y-%m-%d %H:%M:%S') + "-" * 51 + "\n\n")
    print(cfg, file=log)
    train_loader, val_loader, test_loader = get_dataloader()
    # Fix: was get_mdoel() -- every sibling task uses the get_model() factory.
    model = get_model()['inceptionresnetv2'].cuda()
    criterion = get_loss()['bce'].cuda()
    optimizer = optim.Adam(model.parameters(), lr=cfg.lr)
    # (lr, epoch) pairs -- presumably consumed by a project-local MultiStepLR
    # that accepts them (torch's stock MultiStepLR takes epoch milestones only).
    milestones = [(1e-3, 0), (1e-2, 5), (1e-3, 40), (1e-4, 50), (5e-5, 60),
                  (1e-4, 70), (1e-5, 80), (5e-5, 90), (1e-6, 100)]
    scheduler = MultiStepLR(optimizer, milestones)
    model = train(task_name, model, optimizer, criterion, scheduler,
                  train_loader, val_loader, log=log)
    submission_best_loss(task_name, model, test_loader, log=log)
def task1():
    """Train DPN68 with combined BCE + balance losses; submit best-F1 weights."""
    task_name = "base_dpn62_balance"
    run_dir = os.path.join(cfg.log_dir, task_name)
    makedir(run_dir)
    log = Logger(os.path.join(cfg.log_dir, task_name + '_log.txt'), mode="a")
    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    log("\n\n" + '-' * 51 + "[START %s]" % stamp + "-" * 51 + "\n\n")
    print(cfg, file=log)

    train_loader, val_loader, test_loader = get_dataloader()
    model = DPN68()
    model.cuda()
    # Both losses are handed to train() as a list.
    criterions = [nn.BCEWithLogitsLoss().cuda(), BalanceLoss().cuda()]
    optimizer = optim.SGD(model.parameters(), lr=cfg.lr,
                          momentum=0.9, weight_decay=1e-5)
    scheduler = lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
    model = train(task_name, model, optimizer, criterions, scheduler,
                  train_loader, val_loader, log=log)
    submission_best_f1(task_name, model, test_loader, log=log)
def fname_fig(compare, output, selector, **params):
    """Return the path of a comparison figure PDF, creating its directory.

    The file name joins model/compare/output/selector plus key=value pairs
    with '-' and appends '.pdf'.
    """
    fig_dir = os.path.join(config.path['figs'], 'compare')
    utils.makedir(fig_dir)
    parts = [config.model, compare, output, selector]
    parts += ['{}={}'.format(key, value) for key, value in params.items()]
    return os.path.join(fig_dir, '-'.join(parts) + '.pdf')
def train():
    """Train Xception on XRay14 with random-disease sampling.

    Fix: the generators previously consumed ``train_val_df``/``test_df`` even
    though the data had just been split into ``train_df``/``val_df`` (and the
    step counts were already derived from the split frames) -- train/val now
    use the split frames consistently.
    """
    train_val_df, test_df, xray14_labels = load_from_text(cfg.data_root)
    train_df, val_df = split_patients_by_patient_ID(train_val_df, 4)
    print('*' * 40, 'tain data', '*' * 40)
    describe_data(train_df, xray14_labels)
    print('*' * 40, 'val data', '*' * 40)
    describe_data(val_df, xray14_labels)
    train_transformer = ImageTransformer(samplewise_normalization=True,
                                         rotation_range=10,
                                         width_shift_range=0.1,
                                         height_shift_range=0.1,
                                         shear_range=0.1,
                                         zoom_range=[0.7, 1.5],
                                         horizontal_flip=True)
    val_transformer = ImageTransformer(samplewise_normalization=True)
    train_gen = random_image_generator(train_transformer, train_df,
                                       cfg.input_shape[:-1], xray14_labels,
                                       batch_size=cfg.batch_size,
                                       color_mode='grayscale')
    val_gen = ImageGeneratorFromPath(val_transformer, val_df['path'],
                                     val_df['xray14_vec'], shuffle=False,
                                     target_size=cfg.input_shape[:-1],
                                     batch_size=cfg.batch_size)
    model = Xception(cfg.input_shape, include_top=True,
                     n_class=len(xray14_labels), pretrain_weights='imagenet')
    model.compile(optimizer='adam', loss='binary_crossentropy',
                  metrics=['binary_accuracy', 'mae'])
    log_path = os.path.join(cfg.log_dir, 'random_disease')
    makedir(log_path)
    weights_path = os.path.join(log_path, cfg.weights_name)
    checkpoint = ModelCheckpoint(weights_path, monitor='val_loss', verbose=1,
                                 save_best_only=True, mode='min',
                                 save_weights_only=True)
    callbacks = [checkpoint]
    train_steps = get_number_of_steps(len(train_df), cfg.batch_size)
    val_steps = get_number_of_steps(len(val_df), cfg.batch_size)
    model.fit_generator(train_gen, epochs=cfg.epochs,
                        steps_per_epoch=train_steps, callbacks=callbacks,
                        validation_data=val_gen, workers=cfg.n_works,
                        max_queue_size=cfg.n_queue, use_multiprocessing=True,
                        validation_steps=val_steps, initial_epoch=0)
def main(args):
    """Interactive keyboard-driven annotation tool for one sensor's depth video.

    Builds results<TASK_SIZE>_<date>/<date>_<sensor> under the result root,
    loads all frame paths for the sensor, then loops: when job == 1 fetch the
    next task and build a fresh VideoState, draw the overlay, show the frame,
    and let utils.read_key dispatch the pressed key (-1 quits).
    """
    # assert int(args.depth_sensor) in SENSOR_PAIRS[args.thermal_sensor]
    depth_data_dir = os.path.join(args.data_root_dir, args.date, '10.233.219.' + args.sensor)
    print(depth_data_dir)
    directory = os.path.join("{}_{}".format(args.date, args.sensor))
    results_dir_root = os.path.join(args.result_root, "results{:d}_{:s}".format(TASK_SIZE, args.date))
    utils.makedir(results_dir_root)
    results_dir = os.path.join(results_dir_root, directory)
    utils.makedir(results_dir)
    # Label ids shown to the annotator; 'Delete' presumably removes a prior
    # annotation -- see utils.read_key for the actual key handling.
    ACTIONS = {
        0: 'Negative',
        1: 'Get in bed',
        2: 'Get out of bed',
        3: 'Get in chair',
        4: 'Get out of chair',
        5: 'Moving in bed',
        6: 'Walking',
        7: 'Lying on bed',
        8: 'Sitting on bed',
        9: 'Sitting in chair',
        10: 'Standing',
        11: 'Delete',
    }
    # for MLHC:
    # 5: turning patient
    # 6: delete
    task_state = TaskState(ACTIONS, args.date, depth_data_dir)
    print("Num tasks: {}".format(task_state.num_tasks))
    job, video_state, = 1, None
    images = sorted(glob(os.path.join(depth_data_dir, '*.jpg')))
    print('# frames:', len(images))
    sys.stdout.flush()
    iid = 0  # NOTE(review): never used below -- dead variable?
    while True:
        if job == 1:
            # Get the next task
            task_id = task_state.task_id
            video_state = VideoState(args.sensor, results_dir, task_state)
            utils.print_info(task_state, video_state)
        depth_image = video_state.get_images()
        image = depth_image
        utils.draw_info(image, task_state, video_state)
        cv2.imshow('Video', image)
        # Block for a key press; read_key mutates the states and returns the
        # next job code (-1 = quit).
        job = utils.read_key(cv2.waitKey(0), task_state, video_state)
        if job == -1:
            break
def create_overview_0(start, end, to_mp3, prefix='', artist='Glossika', album='Glossika Training'):
    """Concatenate outputB tracks start..end into a single '0' overview track.

    Fixes: ``artist``/``album`` were previously free names (NameError unless
    defined as module globals) -- they are now keyword parameters with the
    defaults used by the sibling create_* functions; the unused
    ``folder_name`` local was removed.
    """
    result = [
        'outputB/FL-%04d-B%s' % (i, '.wav') for i in range(start, end + 1)
    ]
    dir_name = OUTPUT_ALL + '(wav)/' + sub_directory()
    name = _get_name(prefix, '0', dir_name)
    makedir(dir_name)
    make_track(result, name)
    convert_mp3(to_mp3, name, dir_name.replace('wav', 'mp3'), artist, album)
def __init__(self, dataset_parameters, base_csv, dataset_dirs):
    """Hold preprocessing configuration and ensure the output directory exists.

    Args:
        dataset_parameters: namespace-like config; ``img_shape`` is coerced to
            a numpy array in place, and ``base_dataset_dir`` /
            ``data_preprocessing_output_dir`` are read from it.
        base_csv: the base annotation CSV (path or frame -- only stored here).
        dataset_dirs: per-dataset directories (only stored here).
    """
    self.dataset_parameters = dataset_parameters
    # Normalize img_shape so later arithmetic can rely on numpy semantics.
    self.dataset_parameters.img_shape = np.asarray(
        self.dataset_parameters.img_shape)
    self.base_dataset_dir = dataset_parameters.base_dataset_dir
    self.dataset_dirs = dataset_dirs
    self.base_csv = base_csv
    # Record type describing one geometric transformation.
    self.transformation_parameters = namedtuple(
        "Transformation", ["center", "angle", "scale", "offset"])
    utils.makedir(dataset_parameters.data_preprocessing_output_dir)
def create_accent_grammar(list_of_tracks, num_files_per_group, num_plays, num_copies=1, prefix='', to_mp3=False, artist='Accent', album='Accent Training', shuffled='', grammar=False):
    """Build shuffled Accent or Grammar training tracks from per-track audio.

    Collects the EN source files for every track number in ``list_of_tracks``
    (the file-name layout differs between Accent and Grammar), optionally
    shuffles per group or globally, pairs them with the VN files, renders
    ``num_copies`` shuffled mixes, and removes the temporary output<type>
    directory at the end.

    NOTE(review): the ``artist``/``album`` parameters are immediately
    overwritten from ``grammar`` below, so caller-supplied values are
    ignored -- confirm whether that is intended.
    """
    type_file = 'Accent' if not grammar else 'Grammar'
    artist = 'Accent' if not grammar else 'Grammar'
    album = 'Accent Training' if not grammar else 'Grammar Training'
    input_files = []
    for track in list_of_tracks:
        sub_input_files = []
        for f in sorted(os.listdir(type_file + '/' + type_file + 'EN/')):
            # Only keep audio files.
            if not (f[-3:] == 'mp3' or f[-3:] == 'wav'):
                continue
            # The track number lives at a different offset in Grammar vs
            # Accent file names.
            if grammar:
                u = f[1:4]
            else:
                u = f[6:9]
            if u == '%03d' % (track):
                sub_input_files.append(type_file + '/' + type_file + 'EN/' + f)
        if shuffled == "group":
            shuffle(sub_input_files)
        input_files.extend(sub_input_files)
    if shuffled == "all":
        shuffle(input_files)
    if prefix == '' or prefix == None:
        prefix = get_prefix(list_of_tracks, grammar)
    # 0 means "pick a sensible group size automatically".
    if num_files_per_group == 0:
        num_files_per_group = get_num_files(len(input_files))
    # Render EN+VN pairs into the temporary output<type> directory.
    generate_from_list_of_files(input_files, type_file + '/' + type_file + 'VN/', type_file, False)
    files = [
        'output' + type_file + '/' + f.split('/')[-1][:-6] + type_file + f.split('/')[-1][-4:]
        for f in input_files
    ]
    # Shuffle files
    for copies in range(int(num_copies)):
        result = shuffle_track(files, num_plays, num_files_per_group)
        dir_name = OUTPUT_ALL + '(wav)/' + sub_directory()
        makedir(dir_name)
        name = get_name(dir_name, prefix, num_plays)
        make_track(result, name)
        convert_mp3(to_mp3, name, dir_name.replace("wav", "mp3"), artist, album)
    # Clean up the temporary per-type output directory.
    rmtree('output' + type_file)
def train():
    """Train Xception with ROC-AUC loss; multi-GPU aware when n_gpus > 0.

    With GPUs, a CPU-resident template model is wrapped by multi_gpu_model and
    checkpointing goes through MultiGPUCheckpoint -- presumably so the saved
    weights are the template's rather than the wrapper's.
    """
    train_val_df = load_train_csv(cfg)
    train_df, val_df = split_train_val(train_val_df, 0.25)
    train_gen = BaseGenerator(train_df, cfg.train_dir, batch_size=cfg.batch_size,
                              aug_args=cfg.aug_args,
                              target_shape=cfg.input_shape[:2], use_yellow=False)
    # NOTE(review): validation uses a fixed (512, 512) target while training
    # uses cfg.input_shape[:2] -- confirm this asymmetry is intended.
    val_gen = BaseGenerator(val_df, cfg.train_dir, batch_size=cfg.batch_size,
                            aug_args=cfg.aug_args, target_shape=(512, 512),
                            use_yellow=False)
    if n_gpus > 0:
        # Keep the template model on CPU so its weights can be saved directly.
        with tf.device('/cpu:0'):
            cpu_model = Xception(cfg.input_shape, include_top=True,
                                 n_class=len(cfg.label_names))
        model = multi_gpu_model(cpu_model, gpus=n_gpus)
    else:
        model = Xception(cfg.input_shape, include_top=True,
                         n_class=len(cfg.label_names))
    model.compile(optimizer=Adam(1e-3), loss=roc_auc_loss,
                  metrics=['binary_accuracy', 'mae'])
    log_dir = os.path.join(cfg.log_dir, 'base_xception')
    makedir(log_dir)
    weights_path = os.path.join(log_dir, cfg.weights_file)
    checkpoint = ModelCheckpoint(weights_path, monitor='val_loss', verbose=1,
                                 save_best_only=True, mode='min',
                                 save_weights_only=True)
    if n_gpus > 0:
        # Swap in the multi-GPU-aware checkpoint that saves cpu_model.
        del checkpoint
        checkpoint = MultiGPUCheckpoint(weights_path, cpu_model,
                                        monitor='val_loss')
    callbacks = [checkpoint]
    callbacks += [ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3,
                                    verbose=1, mode='min')]
    train_steps = get_number_of_steps(len(train_df), cfg.batch_size)
    val_steps = get_number_of_steps(len(val_df), cfg.batch_size)
    model.fit_generator(train_gen, epochs=cfg.epochs,
                        steps_per_epoch=train_steps, callbacks=callbacks,
                        validation_data=val_gen, workers=cfg.n_works,
                        max_queue_size=cfg.n_queue, use_multiprocessing=True,
                        validation_steps=val_steps, initial_epoch=0)
    K.clear_session()
def create_overview_en(files, start, end, to_mp3, prefix='', artist='Glossika', album='Glossika Training'):
    """Build an English overview track: each file followed by a silence clip.

    Fix: ``artist``/``album`` were free names (NameError unless module
    globals) -- now keyword parameters defaulting to the values used by the
    sibling create_* functions.
    """
    makedir('outputSE')
    result = []
    for f in files:
        # Generate a silence clip derived from the source file (presumably
        # matching its duration/format -- see create_silence_from_file).
        silence_file = 'outputSE/' + f.split('/')[-1][:-4] + 's.' + f[-3:]
        create_silence_from_file(f, silence_file)
        result += [GLOSSIKA_EN + f.split('/')[-1], silence_file]
    dir_name = OUTPUT_ALL + '(wav)/' + sub_directory()
    name = _get_name(prefix, 'en', dir_name)
    makedir(dir_name)
    make_track(result, name)
    convert_mp3(to_mp3, name, dir_name.replace('wav', 'mp3'), artist, album)
def create_review(files, start, end, num_plays, num_files_per_group, log=False, log_tracks=0, num_copies=1, to_mp3=False, artist='Glossika', album='Glossika Training', name=None):
    '''
    Combine files to make them useful for Glossika Training.

    num_plays: each track is played num_plays times
    num_files_per_group: number of tracks per playlist (0 = auto-pick)
    start, end: start/end track numbers (used for the default name prefix)
    log: set True to print debug information
    log_tracks: used in debug mode
    num_copies: number of shuffled copies of the output file to render
    to_mp3: set True to convert the output file to .mp3
    artist, album: meta information applied when to_mp3=True
    name: optional explicit name prefix (overrides the Review_#### default)
    '''
    makedir('outputB')
    # if shuffled == 'all': shuffle(files)
    if num_files_per_group == 0:
        # Fix: was len(input_files) -- a NameError; the parameter is ``files``.
        num_files_per_group = get_num_files(len(files))
    # Shuffle files
    prefix = 'Review_%04d_%04d' % (start, end)
    if name is not None and not name == '':
        prefix = name
    for copies in range(int(num_copies)):
        result = shuffle_track(files, num_plays, num_files_per_group)
        dir_name = OUTPUT_ALL + '(wav)/' + sub_directory()
        makedir(dir_name)
        name = get_name(dir_name, prefix, num_plays)
        make_track(result, name)
        convert_mp3(to_mp3, name, dir_name.replace("wav", "mp3"), artist, album)
        print_log(log, log_tracks, result)
    print('Shuffle Files: Done')
def repair(X_train_mv, save_dir=None):
    """Generate candidate imputations for a frame with exactly two NaN columns.

    For each of the two columns containing missing values, the candidate fill
    values are 4 evenly spaced points between the column min and max plus the
    column mean (deduplicated, sorted). Every pair of candidates yields one
    repaired copy named "<i>_<j>".

    Args:
        X_train_mv: DataFrame with missing values in exactly two columns
            (the unpack below raises ValueError otherwise).
        save_dir: when given, each repaired frame is written there as
            "<name>.csv" via utils.makedir.

    Returns:
        dict mapping repair name -> repaired DataFrame.
    """
    has_nan = X_train_mv.isnull().any(axis=0)
    mv_columns = list(has_nan[has_nan].index)  # idiom fix: was "== True"

    repair_dict = {}
    for c in mv_columns:
        X_c = X_train_mv[c].dropna().values
        cand = set(np.linspace(min(X_c), max(X_c), 4))
        cand.add(X_c.mean())
        repair_dict[c] = sorted(cand)  # idiom fix: was sorted(list(cand))

    c1, c2 = mv_columns
    X_train_repairs = {}
    for i, v1 in enumerate(repair_dict[c1]):
        for j, v2 in enumerate(repair_dict[c2]):
            name = "{}_{}".format(i, j)
            if name == "2_2":
                # NOTE(review): assumes both column means sort to index 2 of
                # their candidate lists -- true when the mean falls in the
                # middle of the range; confirm for skewed columns.
                name = "mean"
            X_train_repairs[name] = X_train_mv.fillna(value={c1: v1, c2: v2})

    if save_dir is not None:
        for name, X_imp in X_train_repairs.items():
            X_imp.to_csv(utils.makedir([save_dir], "{}.csv".format(name)),
                         index=False)
    return X_train_repairs
def base_dpn92_800_kfold(k=5, n_select=0):
    """Run fold ``n_select`` of a k-fold DPN92 pipeline at 800x800 and submit.

    NOTE(review): the training block (criterion/optimizer/scheduler/train) is
    commented out, so the freshly constructed model goes straight to
    submission_best_loss -- presumably it reloads the best checkpoint by task
    name; confirm, otherwise this submits an untrained model.
    """
    task_name = "dpn92_8_KF" + str(n_select)
    makedir(os.path.join(cfg.log_dir, task_name))
    log = Logger(os.path.join(cfg.log_dir, task_name + '_log.txt'), mode="a")
    log("\n\n" + '-' * 51 + "[START %s]" % datetime.now().strftime('%Y-%m-%d %H:%M:%S') + "-" * 51 + "\n\n")
    print(cfg, file=log)
    # use_extra pulls in the extra dataset; target_shape upsizes to 800x800.
    train_loader, val_loader, test_loader = get_kfold_dataloader(
        k, n_select=n_select, use_extra=True, target_shape=(800, 800))
    model = get_model()['dpn92']().cuda()
    # criterion = get_loss()['bce'].cuda()
    # optimizer = optim.SGD(model.parameters(), lr=cfg.lr, momentum=0.9, weight_decay=1e-4)
    # scheduler = lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.35)
    # model = train(task_name, model, optimizer, criterion, scheduler, train_loader, val_loader, log=log)
    submission_best_loss(task_name, model, test_loader, log=log)
def run_cp_clean(data, model, n_jobs=4, debug_dir=None, restore=False, method="cpclean", sample_size=100):
    """Fit CPClean across all candidate repairs and score val/test accuracy.

    Args:
        data: dict holding candidate repairs ('X_train_repairs', keyed by
            repair-method name), ground truth ('X_train_gt'), the repair-method
            order ('repair_methods'), and train/val/test splits.
        model: dict; model['params']['n_neighbors'] sets K, and the whole dict
            is also handed to the Debugger.
        n_jobs: parallelism for CPClean.
        debug_dir: when given, per-method debug output lands in
            <debug_dir>/<method> (created via utils.makedir).
        restore: forwarded to CPClean.fit -- presumably resumes saved state.
        method: cleaning strategy name; doubles as the debug subdirectory.
        sample_size: forwarded to CPClean.fit.

    Returns:
        dict with CP test/val accuracy and the fraction of examples cleaned.
    """
    # Stack the candidate repairs in the order given by data['repair_methods'].
    X_train_repairs = np.array(
        [data["X_train_repairs"][m] for m in data["repair_methods"]])
    cleaner = CPClean(K=model["params"]["n_neighbors"], n_jobs=n_jobs,
                      random_state=1)
    debugger = Debugger(data, model, utils.makedir([debug_dir, method]))
    cleaner.fit(X_train_repairs, data["y_train"], data["X_val"], data["y_val"],
                gt=data["X_train_gt"],
                X_train_mean=data["X_train_repairs"]["mean"],
                debugger=debugger, restore=restore, method=method,
                sample_size=sample_size)
    val_acc = cleaner.score(data["X_val"], data["y_val"])
    test_acc = cleaner.score(data["X_test"], data["y_test"])
    cp_result = {
        "test_acc_cp": test_acc,
        "val_acc_cp": val_acc,
        "percent_clean": debugger.percent_clean
    }
    return cp_result
def save_data(data_dict, info, save_dir):
    """Persist every DataFrame in ``data_dict`` as CSV plus an info.json.

    Non-DataFrame entries are skipped. CSV paths come from utils.makedir
    (which presumably also creates ``save_dir``); info.json is written with
    a plain join, so the directory must exist by then.
    """
    frames = ((key, val) for key, val in data_dict.items()
              if isinstance(val, pd.DataFrame))
    for name, frame in frames:
        out_path = utils.makedir([save_dir], "{}.csv".format(name))
        frame.to_csv(out_path, index=False)
    info_path = os.path.join(save_dir, 'info.json')
    with open(info_path, 'w') as f:
        json.dump(info, f, indent=4)
def save_log(self):
    """Dump the accumulated iteration log to <debug_dir>/details.csv."""
    columns = (
        "n_iter", "n_val", "selection", "time", "percent_cc",
        "percent_clean", "clean_val_acc", "gt_val_acc", "mean_val_acc",
        "clean_test_acc", "gt_test_acc", "mean_test_acc",
    )
    frame = pd.DataFrame(self.logging, columns=list(columns))
    out_path = utils.makedir([self.debug_dir], "details.csv")
    frame.to_csv(out_path, index=False)
def main():
    """Entry point: resolve a Douyin account and download all of its videos.

    Returns None; exits early when the username or video list can't be
    fetched.
    """
    douyin_id = get_douyin_id()
    username = get_username(douyin_id)
    if not username:
        return
    makedir(username)
    video_urls = get_all_video_urls(douyin_id, 0)
    if video_urls:
        download_all_videos(video_urls, username)
def upload(release = True):
    """Upload the project directory to the server as a timestamped release.

    Archives the project, pushes the tarball with fabric's put(), unpacks it
    into env.release_path on the remote host, and removes the tarball both
    remotely and locally. When ``release`` is True the release/log/conf
    directories are created first.
    """
    import time
    import os
    # Make env.path absolute by prefixing the remote working directory.
    if release and not env.path.startswith('/'):
        result = run('pwd').split(' ')[0]
        env.path = os.path.join(result, env.path)
    # Timestamped release folder name, e.g. 20240101-120000.
    release_name = time.strftime('%Y%m%d-%H%M%S')
    utils.get_directories(release_name, release)
    env.tarfile = archive(release)
    # put tar package
    if release:
        utils.makedir(env.release_path)
        run('cd; mkdir %(logdir)s; mkdir %(confdir)s' % env)
    put(env.tarfile, '%(path)s' % env)
    run('cd %(release_path)s && tar zxf ../%(tarfile)s' % env)
    run('rm %(path)s/%(tarfile)s' % env)
    local('rm %(tarfile)s' % env)
def download_tiles_by_xyz(out_dir: Union[str, Path], url_base: str, x_start, x_end, y_start, y_end, z):
    """Fetch every XYZ tile in the inclusive x/y box at zoom ``z``.

    ``url_base`` must contain {X}/{Y}/{Z} placeholders; a short sleep
    throttles consecutive requests.
    """
    out_dir = makedir(out_dir)
    for tile_x in range(x_start, x_end + 1):
        for tile_y in range(y_start, y_end + 1):
            print(tile_x, tile_y)
            tile_url = url_base.format(X=tile_x, Y=tile_y, Z=z)
            getImgFromUrl(out_dir, tile_url, tile_x, tile_y, z)
            time.sleep(0.005)
def download_nls(locations_fn: str, out_dir_root: str, z=16):
    """Download NLS tiles for every city listed in a JSON locations file.

    Each city entry provides xmin/xmax/ymin/ymax and may override the zoom
    with its own 'z'; tiles land in <out_dir_root>/<city>.

    Fix: the per-city zoom was assigned back to ``z`` (z = geo.get('z', z)),
    so one city's override leaked into every later city; it is now held in a
    local so each city falls back to the function argument.
    """
    out_dir_root = makedir(out_dir_root)
    with open(locations_fn) as f:
        city_geos = json.load(f)
    print(list(city_geos.keys()))
    for city, geo in tqdm.tqdm(city_geos.items(), desc='city-loop'):
        xmin, xmax, ymin, ymax = geo['xmin'], geo['xmax'], geo['ymin'], geo[
            'ymax']
        city_z = geo.get('z', z)  # per-city zoom, defaulting to the argument
        print('=' * 80)
        print('Started ', city)
        out_dir = Path(out_dir_root) / city
        out_dir = makedir(out_dir)
        url_base = ts.tile_sources[ts.NLS.name]
        download_tiles_by_lnglat(out_dir, url_base, xmin, xmax, ymin, ymax,
                                 city_z)
        print(f'Done {city}\n\n')
def weat(embed, save_dir, embed_name):
    """WEAT-style permutation test of gender association for word set B1.

    Computes the statistic S(X, Y, M, F) on the real target/attribute sets,
    then a permutation null over 10000 shuffles of the pooled X and Y; the
    reported score is the one-sided p-value P(S_perm > S_obs). Written to
    <save_dir>/weat/<embed_name>_score.csv and returned.
    """
    def association(w, M, F):
        # Mean cosine with male attributes minus mean cosine with female ones.
        s = 0
        for m in M:
            s += cosine(w, m) / len(M)
        for f in F:
            s -= cosine(w, f) / len(F)
        return s
    def S(X, Y, M, F):
        # Differential association of target set X versus target set Y.
        s = 0
        for x in X:
            s += association(x, M, F)
        for y in Y:
            s -= association(y, M, F)
        return s
    def test(X, Y, M, F):
        # Permutation p-value: how often a random split beats the observed S.
        s0 = S(X, Y, M, F)
        np.random.seed(1)  # fixed seed -> deterministic p-value
        U = np.vstack([X, Y])
        s_hat = []
        for i in range(10000):
            idx = np.random.permutation(len(U))
            X_hat = U[idx[:len(X)]]
            Y_hat = U[idx[len(X):]]
            si = S(X_hat, Y_hat, M, F)
            s_hat.append(si)
        s_hat = np.array(s_hat)
        pvalue = (s_hat > s0).mean()
        return pvalue
    with open("data/weat.json") as f:
        data = json.load(f)
    vectors = {}
    for name, words in data.items():
        _, vectors[name] = get_word_vectors(embed, words)
    M = vectors["M"]
    F = vectors["F"]
    X = vectors["B1_X"]
    Y = vectors["B1_Y"]
    pvalues = test(X, Y, M, F)
    score = pd.DataFrame([pvalues], columns=["score"])
    score.to_csv(makedir([save_dir, "weat"],
                         "{}_score.csv".format(embed_name)),
                 index=False)
    return pvalues
def analogy(embed, save_dir, embed_name):
    """Evaluate SemBias analogies: which word pair best matches he-she.

    For every line of data/Sembias (four word pairs), pick the pair whose
    difference vector is most similar to embed['he'] - embed['she'] and tally
    the winning category (index 0 = definition, 1/2 = none, 3 = stereotype).

    Fixes: the Sembias file handles are now closed (context managers), and a
    line where no candidate pair is in the vocabulary is skipped instead of
    reusing ``max_idx`` from the previous line (previously stale, or a
    NameError on the first line).

    Returns:
        (definition_acc, stereotype_acc, none_acc)
    """
    definition_num = 0
    none_num = 0
    stereotype_num = 0
    total_num = 0
    sub_definition_num = 0
    sub_none_num = 0
    sub_stereotype_num = 0
    sub_size = 40
    # First line index of the size-40 tail subset.
    with open("data/Sembias") as f:
        n_lines = sum(1 for line in f)
    sub_start = n_lines - sub_size
    gender_v = embed['he'] - embed['she']
    with open("data/Sembias") as bias_analogy_f:
        for sub_idx, l in enumerate(bias_analogy_f):
            l = l.strip().split()
            max_score = -100
            max_idx = None
            for i, word_pair in enumerate(l):
                word_pair = word_pair.split(':')
                if word_pair[0] not in embed or word_pair[1] not in embed:
                    continue
                pre_v = embed[word_pair[0]] - embed[word_pair[1]]
                score = cosine(gender_v, pre_v)
                if score > max_score:
                    max_idx = i
                    max_score = score
            if max_idx is None:
                # No pair was in-vocabulary; don't count this line.
                continue
            if max_idx == 0:
                definition_num += 1
                if sub_idx >= sub_start:
                    sub_definition_num += 1
            elif max_idx == 1 or max_idx == 2:
                none_num += 1
                if sub_idx >= sub_start:
                    sub_none_num += 1
            elif max_idx == 3:
                stereotype_num += 1
                if sub_idx >= sub_start:
                    sub_stereotype_num += 1
            total_num += 1
    definition_acc = definition_num / total_num
    stereotype_acc = stereotype_num / total_num
    none_acc = none_num / total_num
    score = pd.DataFrame(
        [[definition_acc, stereotype_acc, none_acc]],
        columns=["definition_acc", "stereotype_acc", "none_acc"])
    score.to_csv(makedir([save_dir, "analogy"],
                         "{}_score.csv".format(embed_name)),
                 index=False)
    return definition_acc, stereotype_acc, none_acc
def download_image(query, output_directory, image_directory):
    """Download one JPG for ``query`` and return '<query>/<filename>'.

    Retries until a file shows up in the target directory -- NOTE: this loops
    forever if every download attempt fails.
    """
    target = f'{output_directory}/{image_directory}'
    makedir(target)
    files = os.listdir(target)
    while not files:
        fetcher = google_images_download.googleimagesdownload()
        fetcher.download({
            "output_directory": output_directory,
            "image_directory": image_directory,
            "keywords": query,
            "format": "jpg",
            "limit": 1,
            # TODO: Drop exact sizing
            # "exact_size": "1920,1080",
            "size": "medium",
            "silent_mode": True,
        })
        files = os.listdir(target)
    return f'{query}/{files[0]}'
def process_reads(out_dir, threads, qual_vals, length_vals, email, unmerged="merged_only"):
    """Run metaBEAT read processing for every (quality, length) parameter pair.

    For each combination a working directory minqual<q>_minlength<l> is
    created under ``out_dir`` and the metaBEAT command line (the module-level
    ``metabeatcline`` template) is executed inside it via a shell. Existing
    work dirs are skipped with a warning; stdout goes to <workdir>/log and
    stderr is printed.
    """
    param_sets = product(qual_vals, length_vals)
    out_dir = out_dir.rstrip('/') + '/'
    for param_set in param_sets:
        workdir = out_dir + 'minqual%i_minlength%i' % param_set
        if path.exists(workdir):
            warnings.warn("skipping %s, exists" % workdir)
            continue
        makedir(workdir)
        # Run metaBEAT from inside the working directory, then pop back up.
        cdcline = "cd %s && {mbcline} && cd .. " % workdir
        mbcline = metabeatcline.format(outdir=out_dir, threads=threads,
                                       trim_qual=param_set[0],
                                       trim_minlength=param_set[1],
                                       unmerged=unmerged, email=email)
        cline = cdcline.format(mbcline=mbcline)
        p = Popen(cline, shell=True, stdout=PIPE, stderr=PIPE)
        out, err = p.communicate()
        if len(out) > 0:
            # NOTE(review): on Python 3 ``out`` is bytes but the log file is
            # opened in text mode ('wt'), which would raise TypeError; the
            # `print (err)` style below suggests Python 2 heritage -- confirm
            # the target interpreter before changing.
            with open(workdir + '/log', 'wt') as hndl:
                hndl.write(out)
        if len(err) > 0:
            print('metaBEAT STDERR')
            print(err)
            #print pass