Example no. 1
def create_dataset(params: argparse.Namespace):
    start = time.time()
    train_set, tr_shape = read_hdf(os.path.join(params.dataset_path, 'train.h5'))
    test_set, ts_shape = read_hdf(os.path.join(params.dataset_path, 'test.h5'))

    input_shape = params.tile_shape + [2, ]
    num_classes = NUM_CLASSES

    seed = 0
    size = input_shape[0]
    stride = int(input_shape[0] / 8)

    break_tiles_info = (seed, size, stride)

    train_tiles_input, train_tiles_label = view_as_window(train_set,
                                                          break_tiles_info,
                                                          tr_shape,
                                                          num_classes)
    test_tiles_input, test_tiles_label = view_as_window(test_set,
                                                        break_tiles_info,
                                                        ts_shape,
                                                        num_classes)

    utils.makedir(params.output_path)
    save_hdf(os.path.join(params.output_path, 'train.h5'), train_tiles_input, train_tiles_label)
    save_hdf(os.path.join(params.output_path, 'test.h5'), test_tiles_input, test_tiles_label)

    print(f"Total time for dataset generation: {time.time() - start}")
def create_overview(files,
                    start,
                    end,
                    num_files_per_group=8,
                    type_output='B',
                    to_mp3=False,
                    artist='Glossika',
                    album='Glossika Training',
                    prefix=''):
    if num_files_per_group == 0:
        num_files_per_group = get_num_files(len(files))
    result = []
    old_start = start
    old_end = end
    start = 0
    end = old_end - old_start + 1
    for i in range(math.ceil((end - start + 1) / num_files_per_group)):
        sub_list = files[start:min(start + num_files_per_group, end)]
        # each group is added to the playlist twice in a row
        result = result + sub_list + sub_list
        start = start + num_files_per_group

    type_num = '1' if type_output == 'B' else '2'
    dir_name = OUTPUT_ALL + '(wav)/' + sub_directory()
    name = _get_name(prefix, type_num, dir_name, old_start, old_end)
    makedir(dir_name)
    make_track(result, name)
    convert_mp3(to_mp3, name, dir_name.replace('wav', 'mp3'), artist, album)

    print('Shuffle Files: Done')
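Every example in this listing depends on a small `makedir` (or `utils.makedir`) helper that is not shown. A minimal sketch, assuming the helper only needs to create the directory (with any missing parents) and tolerate it already existing, could look like this:

import os

def makedir(path):
    # Assumed behaviour: create `path` and any missing parent directories,
    # do nothing if it already exists, and return the path so calls such as
    # `out_dir = makedir(out_dir)` also work.
    os.makedirs(path, exist_ok=True)
    return path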
Example no. 3
def make_pie(sim,select,phi):
  plt.figure(figsize=(2,2))
  selectors = [
    sim.model.select[name].union(sim.model.select[select])
    for name in names
  ]
  SIR = np.array([
    modelutils.taccum(sim.outputs[out],**selector).islice(t=sim.t[-1])
    for selector in selectors
  ])
  SIR = SIR / SIR.sum()
  colors = [selector.color.lighten(light) for selector,light in zip(selectors,lights)]
  labels = [sim.model.select[name].label for name in names]
  reorder = lambda x: [x[0],x[2],x[1]]
  # reorder = lambda x: [x[0],x[1]]
  plt.pie(reorder(SIR), colors=reorder(colors), startangle=90, counterclock=True )
  plt.tight_layout(pad=-1.8)
  figdir = os.path.join(config.path['figs'],'flows','phi={}'.format(phi))
  utils.makedir(figdir)
  if config.save:
    plt.savefig(os.path.join(figdir,'{}-{}.pdf'.format('flow',select)),transparent=True)
  else:
    plt.show()
  plt.close()
  make_legend(labels,colors)
Example no. 4
def train():
    # train_val_df, test_df, xray14_labels = load_from_text(cfg.data_root)
    train_val_df, test_df, xray14_labels = load_from_npy()
    train_df, valid_df = split_train_val(train_val_df, ratio=0.25)
    print('*'*40, 'train data', '*'*40)
    describe_data(train_df, xray14_labels)
    print('*' * 40, 'val data', '*' * 40)
    describe_data(valid_df, xray14_labels)
    train_gen, val_gen = creat_tain_val_generator(train_df, valid_df, cfg.input_shape[:-1], 
                                                  batch_size=cfg.batch_size)
    model = Xception(cfg.input_shape, include_top=True, n_class=len(xray14_labels), pretrain_weights='imagenet')
    model.compile(optimizer=Adam(), loss=[focal_loss()], metrics=['binary_accuracy', 'mae'])
    log_path = os.path.join(cfg.log_dir, 'xray14_focal')
    makedir(log_path)
    weights_path = os.path.join(log_path, cfg.weights_name)
    checkpoint = ModelCheckpoint(weights_path, monitor='val_loss', 
                                 verbose=1, save_best_only=True, 
                                 mode='min', save_weights_only=True)
    callbacks = [checkpoint, LearningRateScheduler(lr_schedule)]
    train_steps = get_number_of_steps(len(train_df), cfg.batch_size)*5
    val_steps = get_number_of_steps(len(valid_df), cfg.batch_size)*2
    model.fit_generator(train_gen, epochs=cfg.epochs,
                                 steps_per_epoch=train_steps,
                                 callbacks=callbacks,
                                 validation_data=val_gen,
                                 workers=cfg.n_works,
                                 max_queue_size=cfg.n_queue,
                                 use_multiprocessing=True,
                                 validation_steps=val_steps,
                                 initial_epoch=0)
Example no. 5
    def __init__(self,
                 data_dir,
                 pickle_file,
                 class_id_to_name_map,
                 save_path_viz,
                 train=True,
                 visualize=True):
        super(VinBigDataset, self).__init__()
        self.data_dir = data_dir
        self.pickle_file = pickle_file
        with open(pickle_file, 'rb') as f:
            self.bboxes_info = pickle.load(f)
        self.save_path_viz = None
        self.visualize = visualize
        self.train = train
        self.class_id_to_name_map = class_id_to_name_map

        if self.visualize:
            self.save_path_viz = save_path_viz
            makedir(self.save_path_viz)

        if self.train:
            self.transforms = train_transforms
            self.transforms_only_image = train_transforms_only_image
        else:
            self.transforms = test_transforms
            self.transforms_only_image = test_transforms_only_image

        self.image_paths = sorted(glob.glob(os.path.join(data_dir, '*')))
Example no. 6
def projection(embed, save_dir, embed_name):
    data = pd.read_csv("data/stereotype_list.csv")
    X = data["male"].values.tolist() + data["female"].values.tolist()
    X_words, X_emb = get_word_vectors(embed, X)

    gender_direction = embed["he"] - embed["she"]

    project = []
    for x in X_emb:
        sim = cosine(x, gender_direction)
        project.append(sim)
    project = np.array(project)
    avg_project = np.abs(project).mean()

    orders = np.argsort(project)

    plt.figure()
    plt.scatter(project, range(len(project)), s=10)
    plt.yticks([])
    plt.xlim([-0.5, 0.5])
    plt.xlabel("Similarity")

    for i in range(5):
        plt.text(project[orders[i]], orders[i] + 0.2, X_words[orders[i]])
        plt.text(project[orders[-(i + 1)]], orders[-(i + 1)] + 0.2,
                 X_words[orders[-(i + 1)]])
    plt.savefig(
        makedir([save_dir, "projection"], "{}_plot.png".format(embed_name)))

    score = pd.DataFrame([[avg_project]], columns=["score"])
    score.to_csv(makedir([save_dir, "projection"],
                         "{}_acc.csv".format(embed_name)),
                 index=False)
    return avg_project
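`projection` above, and later snippets such as `repair`, `save_data`, `weat`, and `analogy`, appear to use a different `makedir` variant that takes a list of path components plus an optional file name and returns a path. A sketch under that assumption:

import os

def makedir(dir_list, file_name=None):
    # Assumed behaviour: join the path components, create the directory if
    # needed, and return either the directory itself or the full path of
    # `file_name` inside it, so the result can go straight to savefig/to_csv.
    dir_path = os.path.join(*dir_list)
    os.makedirs(dir_path, exist_ok=True)
    if file_name is None:
        return dir_path
    return os.path.join(dir_path, file_name)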
Example no. 7
 def get_path(self, model_path, probe_data):
     path_list = model_path.split('/')[:-1]
     path_list[1] = 'results'
     path_list += [probe_data, '']
     path = '/'.join(path_list)
     self.path = path
     makedir(self.path)
Example no. 8
def base_dpn68_mixup():
    task_name = "base_dpn68_mixup"
    makedir(os.path.join(cfg.log_dir, task_name))
    print("Task Name is ", task_name)
    log = Logger(os.path.join(cfg.log_dir, task_name + '_log.txt'), mode="a")
    log("\n\n" + '-' * 51 +
        "[START %s]" % datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
        "-" * 51 + "\n\n")
    print(cfg, file=log)
    train_loader, val_loader, test_loader, mix_loader = get_dataloader(
        mix_up=True)
    model = get_model()['dpn68'].cuda()
    criterion = get_loss()['bce'].cuda()
    optimizer = optim.SGD(model.parameters(),
                          lr=cfg.lr,
                          momentum=0.9,
                          weight_decay=1e-6)
    scheduler = lr_scheduler.StepLR(optimizer, step_size=8, gamma=0.1)
    model = train(task_name,
                  model,
                  optimizer,
                  criterion,
                  scheduler,
                  train_loader,
                  val_loader,
                  mix_loder=mix_loader,
                  log=log)
    submission_best_loss(task_name, model, test_loader, log=log)
Example no. 9
def task4():
    task_name = "base_inception_restnet"
    print("Task Name is ", task_name)
    makedir(os.path.join(cfg.log_dir, task_name))
    log = Logger(os.path.join(cfg.log_dir, task_name + '_log.txt'), mode="a")
    log("\n\n" + '-' * 51 +
        "[START %s]" % datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
        "-" * 51 + "\n\n")
    print(cfg, file=log)
    train_loader, val_loader, test_loader = get_dataloader()
    model = get_model()['inceptionresnetv2'].cuda()
    criterion = get_loss()['bce'].cuda()
    optimizer = optim.Adam(model.parameters(), lr=cfg.lr)
    milestones = [(1e-3, 0), (1e-2, 5), (1e-3, 40), (1e-4, 50), (5e-5, 60),
                  (1e-4, 70), (1e-5, 80), (5e-5, 90), (1e-6, 100)]
    scheduler = MultiStepLR(optimizer, milestones)
    model = train(task_name,
                  model,
                  optimizer,
                  criterion,
                  scheduler,
                  train_loader,
                  val_loader,
                  log=log)
    submission_best_loss(task_name, model, test_loader, log=log)
Example no. 10
def task1():
    task_name = "base_dpn62_balance"
    makedir(os.path.join(cfg.log_dir, task_name))
    log = Logger(os.path.join(cfg.log_dir, task_name + '_log.txt'), mode="a")
    log("\n\n" + '-' * 51 +
        "[START %s]" % datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
        "-" * 51 + "\n\n")
    print(cfg, file=log)
    train_loader, val_loader, test_loader = get_dataloader()
    model = DPN68()
    model.cuda()
    criterion1 = nn.BCEWithLogitsLoss().cuda()
    criterion2 = BalanceLoss().cuda()
    criterions = [criterion1, criterion2]
    optimizer = optim.SGD(model.parameters(),
                          lr=cfg.lr,
                          momentum=0.9,
                          weight_decay=1e-5)
    scheduler = lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
    model = train(task_name,
                  model,
                  optimizer,
                  criterions,
                  scheduler,
                  train_loader,
                  val_loader,
                  log=log)
    submission_best_f1(task_name, model, test_loader, log=log)
Example no. 11
def fname_fig(compare, output, selector, **params):
    path = os.path.join(config.path['figs'], 'compare')
    utils.makedir(path)
    return os.path.join(
        path, '-'.join(
            [config.model, compare, output, selector] +
            ['{}={}'.format(name, value)
             for name, value in params.items()]) + '.pdf')
Example no. 12
def train():
    train_val_df, test_df, xray14_labels = load_from_text(cfg.data_root)
    train_df, val_df = split_patients_by_patient_ID(train_val_df, 4)
    print('*' * 40, 'train data', '*' * 40)
    describe_data(train_df, xray14_labels)
    print('*' * 40, 'val data', '*' * 40)
    describe_data(val_df, xray14_labels)
    train_transformer = ImageTransformer(samplewise_normalization=True,
                                         rotation_range=10,
                                         width_shift_range=0.1,
                                         height_shift_range=0.1,
                                         shear_range=0.1,
                                         zoom_range=[0.7, 1.5],
                                         horizontal_flip=True)
    val_transformer = ImageTransformer(samplewise_normalization=True)
    train_gen = random_image_generator(train_transformer,
                                       train_val_df,
                                       cfg.input_shape[:-1],
                                       xray14_labels,
                                       batch_size=cfg.batch_size,
                                       color_mode='grayscale')
    val_gen = ImageGeneratorFromPath(val_transformer,
                                     test_df['path'],
                                     test_df['xray14_vec'],
                                     shuffle=False,
                                     target_size=cfg.input_shape[:-1],
                                     batch_size=cfg.batch_size)

    model = Xception(cfg.input_shape,
                     include_top=True,
                     n_class=len(xray14_labels),
                     pretrain_weights='imagenet')
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['binary_accuracy', 'mae'])
    log_path = os.path.join(cfg.log_dir, 'random_disease')
    makedir(log_path)
    weights_path = os.path.join(log_path, cfg.weights_name)
    checkpoint = ModelCheckpoint(weights_path,
                                 monitor='val_loss',
                                 verbose=1,
                                 save_best_only=True,
                                 mode='min',
                                 save_weights_only=True)
    callbacks = [checkpoint]
    train_steps = get_number_of_steps(len(train_df), cfg.batch_size)
    val_steps = get_number_of_steps(len(val_df), cfg.batch_size)
    model.fit_generator(train_gen,
                        epochs=cfg.epochs,
                        steps_per_epoch=train_steps,
                        callbacks=callbacks,
                        validation_data=val_gen,
                        workers=cfg.n_works,
                        max_queue_size=cfg.n_queue,
                        use_multiprocessing=True,
                        validation_steps=val_steps,
                        initial_epoch=0)
Example no. 13
def main(args):
  # assert int(args.depth_sensor) in SENSOR_PAIRS[args.thermal_sensor]
  depth_data_dir = os.path.join(args.data_root_dir, args.date, '10.233.219.'+args.sensor)
  print(depth_data_dir)
  directory = "{}_{}".format(args.date, args.sensor)
  results_dir_root = os.path.join(args.result_root, "results{:d}_{:s}".format(TASK_SIZE, args.date))
  utils.makedir(results_dir_root)
  results_dir = os.path.join(results_dir_root, directory)
  utils.makedir(results_dir)

  ACTIONS = {
    0: 'Negative',
    1: 'Get in bed',
    2: 'Get out of bed',
    3: 'Get in chair',
    4: 'Get out of chair',
    5: 'Moving in bed',
    6: 'Walking',
    7: 'Lying on bed',
    8: 'Sitting on bed',
    9: 'Sitting in chair',
    10: 'Standing',
    11: 'Delete',
  }

  # for MLHC:
  # 5: turning patient
  # 6: delete

  task_state = TaskState(ACTIONS, args.date, depth_data_dir)
  print("Num tasks: {}".format(task_state.num_tasks))

  job, video_state = 1, None
  images = sorted(glob(os.path.join(depth_data_dir, '*.jpg')))
  print('# frames:', len(images))
  sys.stdout.flush()

  iid = 0
  while True:
    if job == 1:
      # Get the next task
      task_id = task_state.task_id
      video_state = VideoState(args.sensor, results_dir, task_state)

    utils.print_info(task_state, video_state)
    depth_image = video_state.get_images()

    image = depth_image

    utils.draw_info(image, task_state, video_state)

    cv2.imshow('Video', image)
    job = utils.read_key(cv2.waitKey(0), task_state, video_state)

    if job == -1:
      break
Example no. 14
def create_overview_0(start, end, to_mp3, prefix=''):
    result = [
        'outputB/FL-%04d-B%s' % (i, '.wav') for i in range(start, end + 1)
    ]
    dir_name = OUTPUT_ALL + '(wav)/' + sub_directory()
    name = _get_name(prefix, '0', dir_name)
    makedir(dir_name)
    make_track(result, name)
    folder_name = GLOSSIKA_OVERVIEW
    convert_mp3(to_mp3, name, dir_name.replace('wav', 'mp3'), artist, album)
Example no. 15
 def __init__(self, dataset_parameters, base_csv, dataset_dirs):
     self.dataset_parameters = dataset_parameters
     self.dataset_parameters.img_shape = np.asarray(
         self.dataset_parameters.img_shape)
     self.base_dataset_dir = dataset_parameters.base_dataset_dir
     self.dataset_dirs = dataset_dirs
     self.base_csv = base_csv
     self.transformation_parameters = namedtuple(
         "Transformation", ["center", "angle", "scale", "offset"])
     utils.makedir(dataset_parameters.data_preprocessing_output_dir)
Example no. 16
def create_accent_grammar(list_of_tracks,
                          num_files_per_group,
                          num_plays,
                          num_copies=1,
                          prefix='',
                          to_mp3=False,
                          artist='Accent',
                          album='Accent Training',
                          shuffled='',
                          grammar=False):

    type_file = 'Accent' if not grammar else 'Grammar'
    artist = 'Accent' if not grammar else 'Grammar'
    album = 'Accent Training' if not grammar else 'Grammar Training'
    input_files = []

    for track in list_of_tracks:
        sub_input_files = []
        for f in sorted(os.listdir(type_file + '/' + type_file + 'EN/')):
            if not (f[-3:] == 'mp3' or f[-3:] == 'wav'): continue
            if grammar: u = f[1:4]
            else: u = f[6:9]
            if u == '%03d' % (track):
                sub_input_files.append(type_file + '/' + type_file + 'EN/' + f)
        if shuffled == "group":
            shuffle(sub_input_files)
        input_files.extend(sub_input_files)

    if shuffled == "all": shuffle(input_files)

    if prefix == '' or prefix is None:
        prefix = get_prefix(list_of_tracks, grammar)

    if num_files_per_group == 0:
        num_files_per_group = get_num_files(len(input_files))

    generate_from_list_of_files(input_files,
                                type_file + '/' + type_file + 'VN/', type_file,
                                False)
    files = [
        'output' + type_file + '/' + f.split('/')[-1][:-6] + type_file +
        f.split('/')[-1][-4:] for f in input_files
    ]
    # Shuffle files

    for copies in range(int(num_copies)):
        result = shuffle_track(files, num_plays, num_files_per_group)
        dir_name = OUTPUT_ALL + '(wav)/' + sub_directory()
        makedir(dir_name)
        name = get_name(dir_name, prefix, num_plays)
        make_track(result, name)
        convert_mp3(to_mp3, name, dir_name.replace("wav", "mp3"), artist,
                    album)

    rmtree('output' + type_file)
Example no. 17
def train():
    train_val_df = load_train_csv(cfg)
    train_df, val_df = split_train_val(train_val_df, 0.25)
    train_gen = BaseGenerator(train_df, cfg.train_dir, batch_size=cfg.batch_size,
                              aug_args=cfg.aug_args,
                              target_shape=cfg.input_shape[:2],
                              use_yellow=False)

    val_gen = BaseGenerator(val_df, cfg.train_dir, batch_size=cfg.batch_size,
                            aug_args=cfg.aug_args,
                            target_shape=(512, 512),
                            use_yellow=False)
    if n_gpus > 0:
        with tf.device('/cpu:0'):
            cpu_model = Xception(cfg.input_shape, include_top=True, n_class=len(cfg.label_names))
            model = multi_gpu_model(cpu_model, gpus=n_gpus)
    else:
        model = Xception(cfg.input_shape, include_top=True, n_class=len(cfg.label_names))

    model.compile(optimizer=Adam(1e-3), loss=roc_auc_loss,
                  metrics=['binary_accuracy', 'mae'])
    log_dir = os.path.join(cfg.log_dir, 'base_xception')
    makedir(log_dir)
    weights_path = os.path.join(log_dir, cfg.weights_file)

    checkpoint = ModelCheckpoint(weights_path, monitor='val_loss',
                                 verbose=1, save_best_only=True,
                                 mode='min', save_weights_only=True)
    
    if n_gpus > 0:
        del checkpoint
        checkpoint = MultiGPUCheckpoint(weights_path, cpu_model, monitor='val_loss')
    
    callbacks = [checkpoint]
    callbacks += [ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, verbose=1, mode='min')]
    train_steps = get_number_of_steps(len(train_df), cfg.batch_size)
    val_steps   = get_number_of_steps(len(val_df), cfg.batch_size)
    model.fit_generator(train_gen, 
                        epochs=cfg.epochs,
                        steps_per_epoch=train_steps,
                        callbacks=callbacks,
                        validation_data=val_gen,
                        workers=cfg.n_works,
                        max_queue_size=cfg.n_queue,
                        use_multiprocessing=True,
                        validation_steps=val_steps,
                        initial_epoch=0)


    K.clear_session()
Example no. 18
def create_overview_en(files, start, end, to_mp3, prefix=''):
    makedir('outputSE')
    result = []
    for f in files:
        silence_file = 'outputSE/' + f.split('/')[-1][:-4] + 's.' + f[-3:]
        create_silence_from_file(f, silence_file)
        result += [GLOSSIKA_EN + f.split('/')[-1], silence_file]

    dir_name = OUTPUT_ALL + '(wav)/' + sub_directory()
    name = _get_name(prefix, 'en', dir_name)

    makedir(dir_name)
    make_track(result, name)
    convert_mp3(to_mp3, name, dir_name.replace('wav', 'mp3'), artist, album)
Example no. 19
def create_review(files,
                  start,
                  end,
                  num_plays,
                  num_files_per_group,
                  log=False,
                  log_tracks=0,
                  num_copies=1,
                  to_mp3=False,
                  artist='Glossika',
                  album='Glossika Training',
                  name=None):
    '''
    Combine files to make them useful for Glossika training.
    num_plays: each track is played num_plays times
    num_files_per_group: the number of tracks per playlist
    start: start track number
    end: end track number
    log: set True to print debug information
    log_tracks: used in debug mode
    num_copies: number of copies of the output file
    to_mp3: set True to convert the output file to .mp3
    artist:
    album: if to_mp3=True, these set the MP3 meta information
    '''
    makedir('outputB')
    # if shuffled == 'all': shuffle(files)

    if num_files_per_group == 0:
        num_files_per_group = get_num_files(len(files))

    # Shuffle files

    prefix = 'Review_%04d_%04d' % (start, end)
    if name is not None and not name == '':
        prefix = name

    for copies in range(int(num_copies)):
        result = shuffle_track(files, num_plays, num_files_per_group)
        dir_name = OUTPUT_ALL + '(wav)/' + sub_directory()
        makedir(dir_name)
        name = get_name(dir_name, prefix, num_plays)
        make_track(result, name)
        convert_mp3(to_mp3, name, dir_name.replace("wav", "mp3"), artist,
                    album)
        print_log(log, log_tracks, result)

    print('Shuffle Files: Done')
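A hypothetical call to `create_review`, only to illustrate the parameters documented in the docstring; the track range is invented and the file names follow the outputB pattern used elsewhere in this module:

review_files = ['outputB/FL-%04d-B.wav' % i for i in range(101, 151)]
create_review(review_files,
              start=101,
              end=150,
              num_plays=3,
              num_files_per_group=10,
              to_mp3=True)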
Example no. 20
def repair(X_train_mv, save_dir=None):
    mv_columns = X_train_mv.isnull().any(axis=0)
    mv_columns = list(mv_columns[mv_columns == True].index)

    repair_dict = {}

    for c in mv_columns:
        X_c = X_train_mv[c].dropna().values
        cand = set(np.linspace(min(X_c), max(X_c), 4))
        cand.add(X_c.mean())
        repair_dict[c] = sorted(list(cand))

    c1, c2 = mv_columns

    X_train_repairs = {}

    for i, v1 in enumerate(repair_dict[c1]):
        for j, v2 in enumerate(repair_dict[c2]):
            name = "{}_{}".format(i, j)
            if name == "2_2":
                name = "mean"
            imp_dict = {c1: v1, c2: v2}
            X_train_repairs[name] = X_train_mv.fillna(value=imp_dict)

    if save_dir is not None:
        for name, X_imp in X_train_repairs.items():
            X_imp.to_csv(utils.makedir([save_dir], "{}.csv".format(name)),
                         index=False)

    return X_train_repairs
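A hypothetical usage sketch for `repair`, assuming `X_train_mv` is a DataFrame with exactly two columns containing missing values (the function unpacks `c1, c2` from them):

import numpy as np
import pandas as pd

# Invented toy frame with exactly two columns that contain missing values.
X_train_mv = pd.DataFrame({
    'age':    [21.0, np.nan, 35.0, 40.0],
    'income': [np.nan, 3200.0, 5100.0, 4000.0],
})
X_train_repairs = repair(X_train_mv, save_dir=None)
print(sorted(X_train_repairs))  # keys such as '0_0', '0_1', ..., plus 'mean'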
Example no. 21
def base_dpn92_800_kfold(k=5, n_select=0):
    task_name = "dpn92_8_KF" + str(n_select)
    makedir(os.path.join(cfg.log_dir, task_name))
    log = Logger(os.path.join(cfg.log_dir, task_name + '_log.txt'), mode="a")
    log("\n\n" + '-' * 51 +
        "[START %s]" % datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
        "-" * 51 + "\n\n")
    print(cfg, file=log)
    train_loader, val_loader, test_loader = get_kfold_dataloader(
        k, n_select=n_select, use_extra=True, target_shape=(800, 800))
    model = get_model()['dpn92']().cuda()
    # criterion = get_loss()['bce'].cuda()
    # optimizer = optim.SGD(model.parameters(), lr=cfg.lr, momentum=0.9, weight_decay=1e-4)
    # scheduler = lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.35)
    # model = train(task_name, model, optimizer, criterion, scheduler, train_loader, val_loader, log=log)
    submission_best_loss(task_name, model, test_loader, log=log)
Example no. 22
def run_cp_clean(data,
                 model,
                 n_jobs=4,
                 debug_dir=None,
                 restore=False,
                 method="cpclean",
                 sample_size=100):
    X_train_repairs = np.array(
        [data["X_train_repairs"][m] for m in data["repair_methods"]])
    cleaner = CPClean(K=model["params"]["n_neighbors"],
                      n_jobs=n_jobs,
                      random_state=1)

    debugger = Debugger(data, model, utils.makedir([debug_dir, method]))

    cleaner.fit(X_train_repairs,
                data["y_train"],
                data["X_val"],
                data["y_val"],
                gt=data["X_train_gt"],
                X_train_mean=data["X_train_repairs"]["mean"],
                debugger=debugger,
                restore=restore,
                method=method,
                sample_size=sample_size)

    val_acc = cleaner.score(data["X_val"], data["y_val"])
    test_acc = cleaner.score(data["X_test"], data["y_test"])
    cp_result = {
        "test_acc_cp": test_acc,
        "val_acc_cp": val_acc,
        "percent_clean": debugger.percent_clean
    }
    return cp_result
Example no. 23
def save_data(data_dict, info, save_dir):
    for name, data in data_dict.items():
        if isinstance(data, pd.DataFrame):
            data.to_csv(utils.makedir([save_dir], "{}.csv".format(name)),
                        index=False)

    with open(os.path.join(save_dir, 'info.json'), 'w') as f:
        json.dump(info, f, indent=4)
Example no. 24
 def save_log(self):
     columns = [
         "n_iter", "n_val", "selection", "time", "percent_cc",
         "percent_clean", "clean_val_acc", "gt_val_acc", "mean_val_acc",
         "clean_test_acc", "gt_test_acc", "mean_test_acc"
     ]
     logging_save = pd.DataFrame(self.logging, columns=columns)
     logging_save.to_csv(utils.makedir([self.debug_dir], "details.csv"),
                         index=False)
Example no. 25
def main():
    '''
    Main function: download the videos.
    :return: None
    '''
    _id = get_douyin_id()

    username = get_username(_id)
    if not username:
        return
    else:
        makedir(username)

    video_urls = get_all_video_urls(_id, 0)
    if not video_urls:
        return

    download_all_videos(video_urls, username)
Example no. 26
def upload(release = True):
    "Upload the ``project`` directory into the server"
    import time
    import os
    if release and not env.path.startswith('/'):
        result = run('pwd').split(' ')[0]
        env.path = os.path.join(result,env.path)
        
    release_name = time.strftime('%Y%m%d-%H%M%S')    
    utils.get_directories(release_name, release)
    env.tarfile = archive(release)
    # put tar package
    if release:
        utils.makedir(env.release_path)
        run('cd; mkdir %(logdir)s; mkdir %(confdir)s' % env)
        put(env.tarfile, '%(path)s' % env)
        run('cd %(release_path)s && tar zxf ../%(tarfile)s' % env)
        run('rm %(path)s/%(tarfile)s' % env)
        local('rm %(tarfile)s' % env)
Example no. 27
def download_tiles_by_xyz(out_dir: Union[str, Path], url_base: str, x_start,
                          x_end, y_start, y_end, z):
    out_dir = makedir(out_dir)

    for x in range(x_start, x_end + 1):
        for y in range(y_start, y_end + 1):
            print(x, y)
            url_tile = url_base.format(X=x, Y=y, Z=z)
            getImgFromUrl(out_dir, url_tile, x, y, z)
            time.sleep(0.005)
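Since `url_base` is formatted with `X`, `Y`, and `Z` keys, it is presumably a slippy-map URL template with `{X}`/`{Y}`/`{Z}` placeholders. A hypothetical call, with the tile source and extent invented for illustration:

url_template = 'https://tiles.example.com/{Z}/{X}/{Y}.png'
download_tiles_by_xyz('tiles/demo', url_template,
                      x_start=32750, x_end=32752,
                      y_start=21780, y_end=21782, z=16)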
Example no. 28
def download_nls(locations_fn: str, out_dir_root: str, z=16):
    out_dir_root = makedir(out_dir_root)

    with open(locations_fn) as f:
        city_geos = json.load(f)
    print(list(city_geos.keys()))

    for city, geo in tqdm.tqdm(city_geos.items(), desc='city-loop'):
        xmin, xmax, ymin, ymax = geo['xmin'], geo['xmax'], geo['ymin'], geo['ymax']
        # per-city zoom overrides the default without clobbering it for later cities
        zoom = geo.get('z', z)

        print('=' * 80)
        print('Started ', city)
        out_dir = Path(out_dir_root) / city
        out_dir = makedir(out_dir)

        url_base = ts.tile_sources[ts.NLS.name]
        download_tiles_by_lnglat(out_dir, url_base, xmin, xmax, ymin, ymax, zoom)
        print(f'Done {city}\n\n')
Example no. 29
def weat(embed, save_dir, embed_name):
    def association(w, M, F):
        s = 0
        for m in M:
            s += cosine(w, m) / len(M)
        for f in F:
            s -= cosine(w, f) / len(F)
        return s

    def S(X, Y, M, F):
        s = 0
        for x in X:
            s += association(x, M, F)
        for y in Y:
            s -= association(y, M, F)
        return s

    def test(X, Y, M, F):
        s0 = S(X, Y, M, F)
        np.random.seed(1)
        U = np.vstack([X, Y])
        s_hat = []
        for i in range(10000):
            idx = np.random.permutation(len(U))
            X_hat = U[idx[:len(X)]]
            Y_hat = U[idx[len(X):]]
            si = S(X_hat, Y_hat, M, F)
            s_hat.append(si)

        s_hat = np.array(s_hat)

        pvalue = (s_hat > s0).mean()
        return pvalue

    with open("data/weat.json") as f:
        data = json.load(f)

    vectors = {}
    for name, words in data.items():
        _, vectors[name] = get_word_vectors(embed, words)

    M = vectors["M"]
    F = vectors["F"]

    X = vectors["B1_X"]
    Y = vectors["B1_Y"]
    pvalues = test(X, Y, M, F)

    score = pd.DataFrame([pvalues], columns=["score"])
    score.to_csv(makedir([save_dir, "weat"],
                         "{}_score.csv".format(embed_name)),
                 index=False)
    return pvalues
Example no. 30
def analogy(embed, save_dir, embed_name):
    bias_analogy_f = open("data/Sembias")

    definition_num = 0
    none_num = 0
    stereotype_num = 0
    total_num = 0
    sub_definition_num = 0
    sub_none_num = 0
    sub_stereotype_num = 0
    sub_size = 40

    sub_start = -(sub_size - sum(1 for line in open("data/Sembias")))

    gender_v = embed['he'] - embed['she']
    for sub_idx, l in enumerate(bias_analogy_f):
        l = l.strip().split()
        max_score = -100
        for i, word_pair in enumerate(l):
            word_pair = word_pair.split(':')
            if word_pair[0] not in embed or word_pair[1] not in embed:
                continue
            pre_v = embed[word_pair[0]] - embed[word_pair[1]]
            score = cosine(gender_v, pre_v)
            if score > max_score:
                max_idx = i
                max_score = score
        if max_idx == 0:
            definition_num += 1
            if sub_idx >= sub_start:
                sub_definition_num += 1
        elif max_idx == 1 or max_idx == 2:
            none_num += 1
            if sub_idx >= sub_start:
                sub_none_num += 1
        elif max_idx == 3:
            stereotype_num += 1
            if sub_idx >= sub_start:
                sub_stereotype_num += 1
        total_num += 1

    definition_acc = definition_num / total_num
    stereotype_acc = stereotype_num / total_num
    none_acc = none_num / total_num

    score = pd.DataFrame(
        [[definition_acc, stereotype_acc, none_acc]],
        columns=["definition_acc", "stereotype_acc", "none_acc"])
    score.to_csv(makedir([save_dir, "analogy"],
                         "{}_score.csv".format(embed_name)),
                 index=False)

    return definition_acc, stereotype_acc, none_acc
Example no. 31
def download_image(query, output_directory, image_directory):
    makedir(f'{output_directory}/{image_directory}')
    files = os.listdir(f'{output_directory}/{image_directory}')

    while files == []:
        response = google_images_download.googleimagesdownload()
        arguments = {
            "output_directory": output_directory,
            "image_directory": image_directory,
            "keywords": query,
            "format": "jpg",
            "limit": 1,
            # TODO: Drop exact sizing
            #            "exact_size": "1920,1080",
            "size": "medium",
            "silent_mode": True
        }
        response.download(arguments)
        files = os.listdir(f'{output_directory}/{image_directory}')

    return f'{query}/{files[0]}'
Example no. 32
def process_reads(out_dir,
                  threads,
                  qual_vals,
                  length_vals,
                  email,
                  unmerged="merged_only"):
    
    param_sets = product(qual_vals, length_vals)

    out_dir = out_dir.rstrip('/') + '/'
    
    for param_set in param_sets:

        workdir = out_dir+'minqual%i_minlength%i' % param_set
        if path.exists(workdir):
            warnings.warn("skipping %s, exists" % workdir)
            continue
        makedir(workdir)
        cdcline = "cd %s && {mbcline} && cd .. " % workdir
        mbcline = metabeatcline.format(outdir=out_dir, 
                                       threads=threads, 
                                       trim_qual=param_set[0], 
                                       trim_minlength=param_set[1],
                                       unmerged=unmerged,
                                       email=email)
        cline = cdcline.format(mbcline=mbcline)
        p = Popen(cline, shell=True, stdout=PIPE, stderr=PIPE)
        out, err = p.communicate()
        if len(out) > 0:
            with open(workdir + '/log', 'wt') as hndl:
                hndl.write(out)
        if len(err) > 0:
            print('metaBEAT STDERR')
            print(err)