def main():
    """Build and persist a bag-of-words or tf-idf corpus in Matrix Market format.

    Reads the dataset named on the command line, extracts its texts, then saves
    the texts and corpus statistics as JSON, the corpus itself as a ``.mm``
    file and the gensim dictionary as a ``.dict`` file below
    ``LDA_PATH/<version>/<corpus_type>``.
    """
    dataset, version, nbfiles, pos_tags, tfidf, args = parse_args()
    corpus_type = "tfidf" if tfidf else "bow"

    logger = init_logging(name=f'MM_{dataset}_{corpus_type}', basic=False, to_stdout=True, to_file=True)
    # Fall back to plain print if no logger could be created.
    logg = logger.info if logger else print
    log_args(logger, args)

    # make_texts may cap the number of files actually read, hence nbfiles
    # is rebound from its return value.
    texts, stats, nbfiles = make_texts(dataset, nbfiles, pos_tags, logg=logg)
    gc.collect()

    file_name = f'{dataset}{nbfiles if nbfiles else ""}_{version}'
    directory = join(LDA_PATH, version)
    if not exists(directory):
        makedirs(directory)

    # --- saving texts ---
    file_path = join(directory, f'{file_name}_texts.json')
    logg(f'Saving {file_path}')
    with open(file_path, 'w') as fp:
        json.dump(texts, fp, ensure_ascii=False)

    # --- saving stats ---
    file_path = join(directory, f'{file_name}_stats.json')
    logg(f'Saving {file_path}')
    with open(file_path, 'w') as fp:
        json.dump(stats, fp)

    # generate and save the dataset as bow or tfidf corpus in Matrix Market format,
    # including dictionary, texts (json) and some stats about corpus size (json)
    corpus, dictionary = texts2corpus(texts, tfidf=tfidf, filter_below=5, filter_above=0.5, logg=logg)

    file_name += f'_{corpus_type}'
    directory = join(directory, corpus_type)
    # BUG FIX: the corpus-type subdirectory was never created, so
    # MmCorpus.serialize would fail on a fresh <version> directory.
    if not exists(directory):
        makedirs(directory)

    # --- saving corpus ---
    file_path = join(directory, f'{file_name}.mm')
    logg(f'Saving {file_path}')
    MmCorpus.serialize(file_path, corpus)

    # --- saving dictionary ---
    file_path = join(directory, f'{file_name}.dict')
    logg(f'Saving {file_path}')
    dictionary.save(file_path)
def main():
    """Rerank topic candidate terms and report/persist the results.

    Parses CLI arguments, builds a ``Reranker`` for the requested dataset and
    runs only the stages enabled on the command line (coherence-, w2v-,
    weight- and oop-based scoring, evaluation, saving, plotting).

    :return: the ``Reranker`` instance, for interactive inspection.
    """
    (dataset, version, corpus_type, metrics, params, nbtopics, topn, cores,
     coh, vec, weight, oop, evaluate, save, plot, args) = parse_args()

    # --- logging ---
    logger = init_logging(name=f'Reranking_{dataset}', basic=False, to_stdout=True, to_file=True)
    logg = logger.info
    log_args(logger, args)

    t0 = time()
    reranker = Reranker(
        dataset=dataset, version=version, corpus_type=corpus_type, params=params,
        nbtopics=nbtopics, nb_candidate_terms=topn, nb_top_terms=10,
        processes=cores, logg=logg,
    )
    # Each stage is optional and controlled by its own CLI flag.
    if coh:
        reranker.rerank_coherence(metrics)
    if vec:
        reranker.rerank_w2v()
    if weight:
        reranker.weight_score()
    if oop:
        reranker.oop_score()
    if evaluate:
        reranker.evaluate()
    if save:
        reranker.save_results()
    if plot:
        reranker.plot()

    logg(f'final shape {reranker.topic_candidates.shape}')
    # Sanity check on the candidate count.  An explicit raise (instead of a
    # bare `assert`) survives `python -O`.
    # NOTE(review): 24975 looks dataset-specific — confirm it holds for every
    # dataset this script is run on.
    expected_candidates = 24975
    if len(reranker.topic_candidates) != expected_candidates:
        raise AssertionError(
            f'unexpected number of topic candidates: '
            f'{len(reranker.topic_candidates)} != {expected_candidates}'
        )

    t1 = int(time() - t0)
    logg(f">>> done in {t1//3600:02d}:{(t1//60)%60:02d}:{t1%60:02d} <<<")
    return reranker
def main(args: argparse.Namespace) -> None:
    """Build tf.Example record files from individual book (.npy) files.

    :param args: ArgumentParser-parsed arguments
    :return: None
    """
    utils.log_args(args)
    if args.sent_per_book != -1:
        utils.warn("Using a max number of sentences per book")

    # Target .tfrecord paths, one per requested output shard.
    output_files = [
        os.path.join(args.output_dir, "%d_TfExample.tfrecord" % shard_idx)
        for shard_idx in range(args.num_example_files)
    ]

    # Stream the generated examples through the tf.Example writer.
    with tf_example_utils.WriteAsTfExample(output_files, args.vocab_file,
                                           args.max_num_tokens) as writer:
        generate_tf_example(args, writer)
def main():
    """Train a width-varied ResNet student by distilling a ResNet teacher.

    Parses command-line options, prepares the (possibly group-shifted)
    dataset, loads a pretrained teacher network from ``--teacher_dir``,
    builds a fresh student of ``--student_width`` and trains it with a
    combined cross-entropy / distillation objective.  Metrics are written
    to per-split CSV loggers below ``--log_dir``.
    """
    parser = argparse.ArgumentParser()
    # Settings
    parser.add_argument('-d', '--dataset', choices=dataset_attributes.keys(), required=True)
    parser.add_argument('-s', '--shift_type', choices=shift_types, required=True)
    # Confounders
    parser.add_argument('-t', '--target_name')
    parser.add_argument('-c', '--confounder_names', nargs='+')
    # Resume?
    parser.add_argument('--resume', default=False, action='store_true')
    # Label shifts
    parser.add_argument('--minority_fraction', type=float)
    parser.add_argument('--imbalance_ratio', type=float)
    # Data
    parser.add_argument('--fraction', type=float, default=1.0)
    parser.add_argument('--root_dir', default=None)
    parser.add_argument('--subsample_to_minority', action='store_true', default=False)
    parser.add_argument('--reweight_groups', action='store_true', default=False)
    parser.add_argument('--augment_data', action='store_true', default=False)
    parser.add_argument('--val_fraction', type=float, default=0.1)
    # Objective
    parser.add_argument('--robust', default=False, action='store_true')
    parser.add_argument('--alpha', type=float, default=0.2)
    parser.add_argument('--generalization_adjustment', default="0.0")
    parser.add_argument('--automatic_adjustment', default=False, action='store_true')
    parser.add_argument('--robust_step_size', default=0.01, type=float)
    parser.add_argument('--use_normalized_loss', default=False, action='store_true')
    parser.add_argument('--btl', default=False, action='store_true')
    parser.add_argument('--hinge', default=False, action='store_true')
    # Model
    parser.add_argument('--model', choices=model_attributes.keys(), default='resnet50')
    parser.add_argument('--train_from_scratch', action='store_true', default=False)
    parser.add_argument('--resnet_width', type=int, default=None)
    # Optimization
    parser.add_argument('--n_epochs', type=int, default=4)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--scheduler', action='store_true', default=False)
    parser.add_argument('--weight_decay', type=float, default=5e-5)
    parser.add_argument('--gamma', type=float, default=0.1)
    parser.add_argument('--minimum_variational_weight', type=float, default=0)
    # Misc
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--show_progress', default=False, action='store_true')
    parser.add_argument('--log_dir', default='./logs')
    parser.add_argument('--log_every', default=50, type=int)
    parser.add_argument('--save_step', type=int, default=10)
    parser.add_argument('--save_best', action='store_true', default=False)
    parser.add_argument('--save_last', action='store_true', default=True)
    parser.add_argument('--student_width', type=int)
    parser.add_argument('--teacher_dir', type=str)
    parser.add_argument('--teacher_width', type=int)
    parser.add_argument('--gpu', type=str)
    parser.add_argument('--temp', type=str)
    args = parser.parse_args()

    gpu = args.gpu
    temp = args.temp
    check_args(args)
    teacher_dir = args.teacher_dir
    student_width = args.student_width
    teacher_width = args.teacher_width

    # Pin the process to the requested GPU before any CUDA initialization.
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu

    def DistillationLoss(temperature):
        """Return a distillation loss: CE(student, target) plus a
        temperature-scaled cross entropy against the teacher's soft targets."""
        cross_entropy = torch.nn.CrossEntropyLoss()

        def loss(student_logits, teacher_logits, target):
            last_dim = len(student_logits.shape) - 1
            p_t = nn.functional.softmax(teacher_logits / temperature, dim=last_dim)
            log_p_s = nn.functional.log_softmax(student_logits / temperature, dim=last_dim)
            # The T^2 factor keeps gradient magnitudes comparable across temperatures.
            return cross_entropy(student_logits, target) - (p_t * log_p_s).sum(
                dim=last_dim).mean() * temperature**2

        return loss

    # BERT-specific configs copied over from run_glue.py
    if args.model == 'bert':
        args.max_grad_norm = 1.0
        args.adam_epsilon = 1e-8
        args.warmup_steps = 0

    # Append to existing logs when resuming, otherwise start fresh.
    if os.path.exists(args.log_dir) and args.resume:
        resume = True
        mode = 'a'
    else:
        resume = False
        mode = 'w'

    ## Initialize logs
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)
    logger = Logger(os.path.join(args.log_dir, 'log.txt'), mode)
    # Record args
    log_args(args, logger)
    set_seed(args.seed)
    print("starting prep")

    # Data
    # Test data for label_shift_step is not implemented yet
    test_data = None
    test_loader = None
    if args.shift_type == 'confounder':
        train_data, val_data, test_data = prepare_data(args, train=True)
    elif args.shift_type == 'label_shift_step':
        train_data, val_data = prepare_data(args, train=True)
    print("done prep")

    loader_kwargs = {
        'batch_size': args.batch_size,
        'num_workers': 16,
        'pin_memory': True
    }
    train_loader = train_data.get_loader(train=True, reweight_groups=args.reweight_groups, **loader_kwargs)
    val_loader = val_data.get_loader(train=False, reweight_groups=None, **loader_kwargs)
    if test_data is not None:
        test_loader = test_data.get_loader(train=False, reweight_groups=None, **loader_kwargs)

    data = {}
    data['train_loader'] = train_loader
    data['val_loader'] = val_loader
    data['test_loader'] = test_loader
    data['train_data'] = train_data
    data['val_data'] = val_data
    data['test_data'] = test_data
    n_classes = train_data.n_classes

    log_data(data, logger)
    logger.flush()

    ## Define the objective
    if args.hinge:
        assert args.dataset in ['CelebA', 'CUB']  # Only supports binary

        def hinge_loss(yhat, y):
            # The torch loss takes in three arguments so we need to split yhat
            # It also expects classes in {+1.0, -1.0} whereas by default we give them in {0, 1}
            # Furthermore, if y = 1 it expects the first input to be higher instead of the second,
            # so we need to swap yhat[:, 0] and yhat[:, 1]...
            torch_loss = torch.nn.MarginRankingLoss(margin=1.0, reduction='none')
            y = (y.float() * 2.0) - 1.0
            return torch_loss(yhat[:, 1], yhat[:, 0], y)

        criterion = hinge_loss
    else:
        criterion = torch.nn.CrossEntropyLoss(reduction='none')

    if resume:
        # Continue epoch numbering from the last test-set log entry.
        df = pd.read_csv(os.path.join(args.log_dir, 'test.csv'))
        epoch_offset = df.loc[len(df) - 1, 'epoch'] + 1
        logger.write(f'starting from epoch {epoch_offset}')
    else:
        epoch_offset = 0

    train_csv_logger = CSVBatchLogger(os.path.join(args.log_dir, 'train.csv'), train_data.n_groups, mode=mode)
    val_csv_logger = CSVBatchLogger(os.path.join(args.log_dir, 'val.csv'), train_data.n_groups, mode=mode)
    test_csv_logger = CSVBatchLogger(os.path.join(args.log_dir, 'test.csv'), train_data.n_groups, mode=mode)
    strain_csv_logger = CSVBatchLogger(os.path.join(args.log_dir, 'strain.csv'), train_data.n_groups, mode=mode)
    sval_csv_logger = CSVBatchLogger(os.path.join(args.log_dir, 'sval.csv'), train_data.n_groups, mode=mode)
    stest_csv_logger = CSVBatchLogger(os.path.join(args.log_dir, 'stest.csv'), train_data.n_groups, mode=mode)

    # Load the pretrained teacher checkpoint and move it onto the GPU.
    teacher = resnet10vw(teacher_width, num_classes=n_classes)
    teacher_old = torch.load(teacher_dir + "/10_model.pth")
    for _, m in teacher_old.named_modules():
        m._non_persistent_buffers_set = set()  # pytorch 1.6.0 compatability
    teacher.load_state_dict(teacher_old.state_dict())
    teacher = teacher.to('cuda')

    distill_criterion = DistillationLoss(float(temp))
    student = resnet10vw(int(student_width), num_classes=n_classes).to('cuda')

    # BUG FIX: `test_csv_logger` was previously passed twice to train();
    # the stest slot now receives `stest_csv_logger`, which was created
    # (and closed) but never actually used.
    train(teacher, student, criterion, distill_criterion, data, logger,
          train_csv_logger, val_csv_logger, test_csv_logger,
          strain_csv_logger, sval_csv_logger, stest_csv_logger, args,
          epoch_offset=epoch_offset)

    train_csv_logger.close()
    val_csv_logger.close()
    test_csv_logger.close()
    strain_csv_logger.close()
    sval_csv_logger.close()
    stest_csv_logger.close()
def main():
    """Run an AirSim flocking simulation until ESC is pressed or the
    configured simulation duration elapses.

    Builds one controller per drone (leader vs. regular, decided by
    membership in ``flock.leader_list``), takes the flock off, then steps
    the regular controllers in a loop while logging flock kinematics.
    """
    ti = time.time()  # wall-clock start, reported at the end
    args = parser.parse_args()
    # Only the 'reynolds' controller is implemented; anything else aborts.
    if args.flocking_method == 'reynolds':
        from reynolds import Controller
    else:
        print("Wrong flocking controller specified.")
        sys.exit(1)
    from leader_controller import Leader_Controller

    path = os.path.join(args.optim_path, args.log_dir)
    timestamp = utils.log_args(path, args)
    logger = utils.get_logger(path, timestamp)
    flock = Flock(args)
    utils.log_init_state(logger, flock)

    # Split drones into leader and follower controllers.
    controller_list = []
    leader_controller_list = []
    for drone in flock.drones:
        if drone.vehicle_name in flock.leader_list:
            controller = Leader_Controller(drone, flock.flock_list, args)
            leader_controller_list.append(controller)
        else:
            controller = Controller(drone, flock.flock_list, args)
            controller_list.append(controller)

    #airsim.wait_key('Press any key to takeoff')
    print("Taking-off")
    flock.take_off()
    #airsim.wait_key('Press any key to go to different altitudes')
    print("Going to different altitudes")
    flock.initial_altitudes()
    #airsim.wait_key('Press any key to start initial motion')
    print("Starting random motion")
    flock.initial_speeds()
    #airsim.wait_key('Press any key to start flocking')
    print("Now flocking")

    count = 0
    first_drone_name = flock.drones[0].vehicle_name
    # AirSim timestamps are in nanoseconds (divided by 1e9 below).
    init_sim_time = flock.client.getMultirotorState(
        vehicle_name=first_drone_name).timestamp
    while True:
        # NOTE(review): only the follower controllers are stepped here;
        # leader_controller_list is built but never stepped — confirm intended.
        for controller in controller_list:
            controller.step()
        # NOTE(review): `count % 1 == 0` is always true — looks like a
        # leftover of a configurable logging interval.
        if count % 1 == 0:
            flock.log_flock_kinematics(logger, count)
        count += 1
        # Pump pygame so keyboard state is refreshed; ESC aborts the run.
        pygame.display.set_mode((1, 1))
        pygame.event.pump()
        keys = pygame.key.get_pressed()
        if keys[K_ESCAPE]:
            flock.reset()
            break
        curr_sim_time = flock.client.getMultirotorState(
            vehicle_name=first_drone_name).timestamp
        # Stop once the simulated elapsed time (ns -> minutes) exceeds the cap.
        if (curr_sim_time - init_sim_time) / 1e9 / 60 > args.single_sim_duration:
            tf = time.time()
            print("Real world time, ", (tf - ti) / 60)
            flock.reset()
            break
def main():
    """Generate human-readable labels for topics via document/word embeddings.

    Loads (or reranks) topics, loads doc2vec plus word2vec/fastText
    embeddings, restricts them to candidate label indices, computes
    ``nb_labels`` labels per topic and writes the result to a CSV file.
    """
    # LOGG is a module-level logging hook used by helpers in this file.
    global LOGG
    (
        topics_file, labels_file, d2v_indices_file, w2v_indices_file, d2v_path, w2v_path,
        use_ftx, dataset, version, corpus_type, rerank, metrics, params, nbtopics,
        total_num_topics, max_title_length, min_doc_length, nb_labels, print_sample, args
    ) = parse_args()
    logger = init_logging(name=f'Labeling_{dataset}', basic=False, to_stdout=True, to_file=False)
    log_args(logger, args)
    LOGG = logger.info

    # Topics come either from an explicit file, from reranker output, or
    # from the stored topic models.
    if topics_file is not None:
        topics = load_topics(
            topics_path=topics_file,
            metrics=metrics,
            params=params,
            nbtopics=nbtopics,
            print_sample=print_sample,
        )
    else:
        if rerank:
            topics = load('rerank', dataset, version, *params, *nbtopics, logger=logger)
            # keep only the rows for the requested reranking metrics
            topics = topics.query('metric in @metrics')
            print(topics)
        else:
            topics = load('topics', dataset, version, corpus_type, *params, *nbtopics, logger=logger)

    d2v_docvecs, d2v_wv, w2v_wv = load_embeddings(
        d2v_path=d2v_path,
        w2v_path=w2v_path,
        use_ftx=use_ftx,
    )

    # Candidate label indices: either unpickled from files or derived from
    # the embeddings themselves.
    if d2v_indices_file and w2v_indices_file:
        with open(d2v_indices_file, 'rb') as fp:
            LOGG(f'Loading {d2v_indices_file}')
            d2v_indices = pickle.load(fp)
        with open(w2v_indices_file, 'rb') as fp:
            LOGG(f'Loading {w2v_indices_file}')
            w2v_indices = pickle.load(fp)
    else:
        d2v_indices, w2v_indices = get_indices(
            d2v_docvecs=d2v_docvecs,
            w2v_wv=w2v_wv,
            max_title_length=max_title_length,
            min_doc_length=min_doc_length
        )
    # de-duplicate and fix ordering for reproducibility
    d2v_indices = sorted(set(d2v_indices))
    w2v_indices = sorted(set(w2v_indices))
    w2v_indexed = index_embeddings(
        d2v_docvecs=d2v_docvecs,
        d2v_wv=d2v_wv,
        w2v_wv=w2v_wv,
        d2v_indices=d2v_indices,
        w2v_indices=w2v_indices
    )

    t0 = time()
    # one label list per topic row; capped at total_num_topics rows
    labels = topics[:total_num_topics].apply(
        lambda row: get_labels(
            topic=row,
            nb_labels=nb_labels,
            d2v_docvecs=d2v_docvecs,
            d2v_wv=d2v_wv,
            w2v_wv=w2v_wv,
            w2v_indexed=w2v_indexed,
            d_indices=d2v_indices,
            w_indices=w2v_indices
        ),
        axis=1
    )
    t1 = int(time() - t0)
    LOGG(f"done in {t1//3600:02d}:{(t1//60) % 60:02d}:{t1 % 60:02d}")
    if print_sample:
        LOGG(f'\n{labels.head(10)}')

    # reformatting output files
    # column names depend on whether fastText or word2vec embeddings were used
    col2 = 'ftx' if use_ftx else 'w2v'
    col3 = 'comb_ftx' if use_ftx else 'comb'
    labels = (
        labels
        .apply(pd.Series)
        .rename(columns={0: 'd2v', 1: col2, 2: col3})
        .stack()
        .apply(pd.Series)
        .rename(columns=lambda x: f'label{x}')
    )
    if print_sample:
        LOGG(f'\n{labels.head(10)}')

    # avoid clobbering an existing output file by appending a timestamp
    if exists(labels_file + '.csv'):
        labels_file = labels_file + '_' + str(time()) + '.csv'
    else:
        labels_file += '.csv'
    LOGG(f'Writing labels to {labels_file}')
    labels.to_csv(labels_file)
def main():
    """Train LSI models for a range of topic counts and persist them.

    Loads the dictionary and Matrix-Market corpus produced by the corpus
    builder, fits one ``LsiModel`` per requested topic count and writes both
    the model and a CSV of its top terms/weights below ``LSI_PATH``.
    """
    # --- arguments ---
    (dataset, version, _, _, nbs_topics, _, _,
     cache_in_memory, use_callbacks, tfidf, args) = parse_args()
    model_class = 'LSImodel'
    _split_ = "_split" if use_callbacks else ""
    data_name = f'{dataset}_{version}_{tfidf}'
    data_dir = join(LDA_PATH, version, tfidf)

    # --- logging ---
    logger = init_logging(name=data_name, basic=False, to_stdout=True, to_file=True)
    logg = logger.info
    log_args(logger, args)

    # --- load dict ---
    logg('Loading dictionary')
    dictionary = Dictionary.load(join(data_dir, f'{data_name}.dict'))

    # --- load corpus ---
    logg('Loading corpus')
    corpus = MmCorpus(join(data_dir, f'{data_name}.mm'))
    if cache_in_memory:
        logg('Reading corpus into RAM')
        corpus = list(corpus)
    # hold out a test split only when callbacks are in use
    train, test = split_corpus(corpus) if use_callbacks else (corpus, [])
    logg(f'size of... train_set={len(train)}, test_set={len(test)}')

    # --- train ---
    topn = 20
    columns = [f'term{x}' for x in range(topn)] + [f'weight{x}' for x in range(topn)]
    for num_topics in nbs_topics:
        gc.collect()
        logg(f'Running {model_class} with {num_topics} topics')
        model = LsiModel(corpus=train, num_topics=num_topics, id2word=dictionary)

        model_dir = join(LSI_PATH, version, tfidf, f'{_split_}')
        model_path = join(model_dir, f'{dataset}_{model_class}{_split_}_{num_topics}')
        if not exists(model_dir):
            makedirs(model_dir)

        # --- save topics ---
        # flatten each topic's (term, weight) pairs into term... weight... rows
        shown = model.show_topics(num_words=topn, formatted=False)
        rows = [list(chain(*zip(*entry[1]))) for entry in shown]
        topics = pd.DataFrame(rows, columns=columns)
        logg(f'Saving topics to {model_path}.csv')
        topics.to_csv(f'{model_path}.csv')

        # --- save model ---
        logg(f'Saving model to {model_path}')
        model.save(model_path)

    # --- done ---
    logg(
        f'\n'
        f'----- end -----\n'
        f'----- {dataset.upper()} -----\n'
        f'{"#" * 50}\n'
    )
def main():
    """Score topics with coherence metrics and embedding similarities.

    Loads topics (or reranker output), evaluates them with several coherence
    configurations (own corpus, Wikipedia texts, w2v/fastText vectors) and
    optional embedding-based similarity metrics, then writes all scores to a
    CSV file and returns the resulting DataFrame.
    """
    (dataset, version, params, nbtopics, topn, cores,
     corpus_type, use_coherence, use_w2v, rerank, lsi, args) = parse_args()
    logger = init_logging(name=f'Eval_topics_{dataset}', basic=False, to_stdout=True, to_file=True)
    log_args(logger, args)
    logg = logger.info

    purpose = 'rerank' if rerank else 'topics'
    topics = load(purpose, dataset, version, corpus_type, lsi, *params, *nbtopics, logg=logg)
    # topn > 0 limits the number of topic rows; otherwise use the full width
    # of the topics frame as the per-topic term count.
    if topn > 0:
        topics = topics[:topn]
    else:
        topn = topics.shape[1]
    logg(f'number of topics: {topics.shape}')
    # Score only unique topics; duplicates get their scores restored later.
    unique_topics = topics.drop_duplicates()
    logg(f'number of unique topics: {unique_topics.shape}')

    wiki_dict = load('dict', 'dewiki', 'unfiltered', logg=logg)

    dfs = []
    if use_coherence:
        # 1) coherence on the dataset's own dictionary/corpus/texts
        dictionary = load('dict', dataset, version, corpus_type, logg=logg)
        corpus = load('corpus', dataset, version, corpus_type, logg=logg)
        texts = load('texts', dataset, version, logg=logg)
        df = eval_coherence(
            topics=unique_topics, dictionary=dictionary, corpus=corpus, texts=texts,
            keyed_vectors=None, metrics=None, window_size=None, suffix='',
            cores=cores, logg=logg, topn=topn,
        )
        del dictionary, corpus, texts
        gc.collect()
        dfs.append(df)

        # 2) coherence against German Wikipedia texts (suffix '_wikt')
        wiki_texts = load('texts', 'dewiki', logg=logg)
        df = eval_coherence(
            topics=unique_topics, dictionary=wiki_dict, corpus=None, texts=wiki_texts,
            keyed_vectors=None, metrics=None, window_size=None, suffix='_wikt',
            cores=cores, logg=logg, topn=topn,
        )
        gc.collect()
        dfs.append(df)

        # 3) c_uci with a window of 20 on the same Wikipedia texts
        df = eval_coherence(
            unique_topics, wiki_dict, corpus=None, texts=wiki_texts,
            keyed_vectors=None, metrics=['c_uci'], window_size=20,
            suffix='_wikt_w20', cores=cores, logg=logg, topn=topn,
        )
        del wiki_texts
        gc.collect()
        dfs.append(df)

    df_sims = None
    if use_w2v:
        d2v = load('d2v', logg=logg).docvecs
        w2v = load('w2v', logg=logg).wv
        ftx = load('ftx', logg=logg).wv
        # Dry run to make sure both indices are fully in RAM
        # NOTE(review): init_sims()/vectors_norm are deprecated in gensim 4 —
        # confirm the pinned gensim version still provides them.
        d2v.init_sims()
        _ = d2v.vectors_docs_norm[0]
        w2v.init_sims()
        _ = w2v.vectors_norm[0]
        ftx.init_sims()
        _ = ftx.vectors_norm[0]

        # vector-based coherence for word2vec and fastText embeddings
        df = eval_coherence(
            topics=unique_topics, dictionary=wiki_dict, corpus=None, texts=None,
            keyed_vectors=w2v, metrics=None, window_size=None, suffix='_w2v',
            cores=cores, logg=logger.info, topn=topn,
        )
        gc.collect()
        dfs.append(df)
        df = eval_coherence(
            topics=unique_topics, dictionary=wiki_dict, corpus=None, texts=None,
            keyed_vectors=ftx, metrics=None, window_size=None, suffix='_ftx',
            cores=cores, logg=logger.info, topn=topn,
        )
        gc.collect()
        dfs.append(df)

        # apply custom similarity metrics
        kvs = {'d2v': d2v, 'w2v': w2v, 'ftx': ftx}
        ms = unique_topics.apply(lambda x: mean_similarity(x, kvs), axis=1)
        ps = unique_topics.apply(
            lambda x: pairwise_similarity(x, kvs, ignore_oov=True), axis=1)
        ps2 = unique_topics.apply(
            lambda x: pairwise_similarity(x, kvs, ignore_oov=False), axis=1)
        df_sims = pd.concat(
            {
                'mean_similarity': ms,
                'pairwise_similarity_ignore_oov': ps,
                'pairwise_similarity': ps2
            },
            axis=1)
        del d2v, w2v, ftx
        gc.collect()

    # Combine all coherence frames and split the packed (score, stdev,
    # support) tuples into proper columns.
    dfs = pd.concat(dfs, axis=1)
    dfs = dfs.stack().apply(pd.Series).rename(columns={
        0: 'score',
        1: 'stdev',
        2: 'support'
    }).unstack()
    if df_sims is not None:
        dfs = pd.concat([dfs, df_sims], axis=1)

    # restore scores for all topics from results of unique topics
    topics.columns = pd.MultiIndex.from_tuples([('terms', t) for t in list(topics.columns)])
    topic_columns = list(topics.columns)
    # forward-fill scores within each duplicate-topic group
    # NOTE(review): fillna(method='ffill') is deprecated in newer pandas —
    # confirm against the pinned pandas version.
    fillna = lambda grp: grp.fillna(method='ffill') if len(grp) > 1 else grp
    dfs = (topics.join(dfs).groupby(topic_columns).apply(fillna).drop(
        topic_columns, axis=1))

    tpx_path = join(LDA_PATH, version, 'bow', 'topics')
    if rerank:
        file = join(tpx_path, f'{dataset}_reranker-eval.csv')
    else:
        file = join(
            tpx_path,
            f'{dataset}{"_"+lsi if lsi else ""}_{version}_{corpus_type}_topic-scores.csv'
        )
    # never overwrite an existing result file; append a unix timestamp
    if exists(file):
        file = file.replace('.csv', f'_{str(time()).split(".")[0]}.csv')
    logg(f'Writing {file}')
    dfs.to_csv(file)
    logg('done')
    return dfs
def main():
    """Evaluate (or resume) a saved model on a group-shifted dataset.

    Rebuilds the dataset and model architecture from CLI arguments, loads the
    checkpoint named by ``--model_test`` when resuming, and runs the shared
    ``train()`` loop, logging per-split metrics to CSV files in ``--log_dir``.
    """
    parser = argparse.ArgumentParser()
    # Settings
    parser.add_argument('-d', '--dataset', choices=dataset_attributes.keys(), required=True)
    parser.add_argument('-s', '--shift_type', choices=shift_types, required=True)
    # Confounders
    parser.add_argument('-t', '--target_name')
    parser.add_argument('-c', '--confounder_names', nargs='+')
    # Resume?
    parser.add_argument('--resume', default=False, action='store_true')
    # Label shifts
    parser.add_argument('--minority_fraction', type=float)
    parser.add_argument('--imbalance_ratio', type=float)
    # Data
    parser.add_argument('--fraction', type=float, default=1.0)
    parser.add_argument('--root_dir', default=None)
    parser.add_argument('--subsample_to_minority', action='store_true', default=False)
    parser.add_argument('--reweight_groups', action='store_true', default=False)
    parser.add_argument('--augment_data', action='store_true', default=False)
    parser.add_argument('--val_fraction', type=float, default=0.1)
    # Objective
    parser.add_argument('--robust', default=False, action='store_true')
    parser.add_argument('--alpha', type=float, default=0.2)
    parser.add_argument('--generalization_adjustment', default="0.0")
    parser.add_argument('--automatic_adjustment', default=False, action='store_true')
    parser.add_argument('--robust_step_size', default=0.01, type=float)
    parser.add_argument('--use_normalized_loss', default=False, action='store_true')
    parser.add_argument('--btl', default=False, action='store_true')
    parser.add_argument('--hinge', default=False, action='store_true')
    # Model
    parser.add_argument('--model', choices=model_attributes.keys(), default='resnet50')
    parser.add_argument('--train_from_scratch', action='store_true', default=False)
    parser.add_argument('--resnet_width', type=int, default=None)
    # Optimization
    parser.add_argument('--n_epochs', type=int, default=4)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--scheduler', action='store_true', default=False)
    parser.add_argument('--weight_decay', type=float, default=5e-5)
    parser.add_argument('--gamma', type=float, default=0.1)
    parser.add_argument('--minimum_variational_weight', type=float, default=0)
    # Misc
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--show_progress', default=False, action='store_true')
    parser.add_argument('--log_dir', default='./logs')
    parser.add_argument('--log_every', default=50, type=int)
    parser.add_argument('--save_step', type=int, default=10)
    parser.add_argument('--save_best', action='store_true', default=False)
    parser.add_argument('--save_last', action='store_true', default=False)
    parser.add_argument('--model_test', type=str)
    parser.add_argument('--gpu', type=str)
    args = parser.parse_args()
    check_args(args)

    model_test = args.model_test
    gpu = args.gpu
    # Pin the process to the requested GPU before any CUDA initialization.
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu

    # BERT-specific configs copied over from run_glue.py
    if args.model == 'bert':
        args.max_grad_norm = 1.0
        args.adam_epsilon = 1e-8
        args.warmup_steps = 0

    # Append to existing logs when resuming, otherwise start fresh.
    if os.path.exists(args.log_dir) and args.resume:
        resume = True
        mode = 'a'
    else:
        resume = False
        mode = 'w'

    ## Initialize logs
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)
    logger = Logger(os.path.join(args.log_dir, model_test + '_log.txt'), mode)
    # Record args
    log_args(args, logger)
    set_seed(args.seed)

    # Data
    # Test data for label_shift_step is not implemented yet
    test_data = None
    test_loader = None
    if args.shift_type == 'confounder':
        train_data, val_data, test_data = prepare_data(args, train=True)
    elif args.shift_type == 'label_shift_step':
        train_data, val_data = prepare_data(args, train=True)

    loader_kwargs = {
        'batch_size': args.batch_size,
        'num_workers': 12,
        'pin_memory': True
    }
    train_loader = train_data.get_loader(train=True, reweight_groups=args.reweight_groups, **loader_kwargs)
    val_loader = val_data.get_loader(train=False, reweight_groups=None, **loader_kwargs)
    if test_data is not None:
        test_loader = test_data.get_loader(train=False, reweight_groups=None, **loader_kwargs)

    data = {}
    data['train_loader'] = train_loader
    data['val_loader'] = val_loader
    data['test_loader'] = test_loader
    data['train_data'] = train_data
    data['val_data'] = val_data
    data['test_data'] = test_data
    n_classes = train_data.n_classes

    log_data(data, logger)

    ## Initialize model
    pretrained = not args.train_from_scratch
    if resume:
        # Load the full checkpoint named on the command line.
        model = torch.load(os.path.join(args.log_dir, model_test))
        d = train_data.input_size()[0]
    elif model_attributes[args.model]['feature_type'] in ('precomputed', 'raw_flattened'):
        assert pretrained
        # Load precomputed features
        d = train_data.input_size()[0]
        model = nn.Linear(d, n_classes)
        model.has_aux_logits = False
    elif args.model == 'resnet50':
        model = torchvision.models.resnet50(pretrained=pretrained)
        d = model.fc.in_features
        model.fc = nn.Linear(d, n_classes)
    elif args.model == 'resnet34':
        model = torchvision.models.resnet34(pretrained=pretrained)
        d = model.fc.in_features
        model.fc = nn.Linear(d, n_classes)
    elif args.model == 'wideresnet50':
        model = torchvision.models.wide_resnet50_2(pretrained=pretrained)
        d = model.fc.in_features
        model.fc = nn.Linear(d, n_classes)
    elif args.model == 'resnet50vw':
        # Width-varied variants are always trained from scratch.
        assert not pretrained
        assert args.resnet_width is not None
        model = resnet50vw(args.resnet_width, num_classes=n_classes)
    elif args.model == 'resnet18vw':
        assert not pretrained
        assert args.resnet_width is not None
        model = resnet18vw(args.resnet_width, num_classes=n_classes)
    elif args.model == 'resnet10vw':
        assert not pretrained
        assert args.resnet_width is not None
        model = resnet10vw(args.resnet_width, num_classes=n_classes)
    elif args.model == 'bert':
        assert args.dataset == 'MultiNLI'
        from pytorch_transformers import BertConfig, BertForSequenceClassification
        config_class = BertConfig
        model_class = BertForSequenceClassification
        config = config_class.from_pretrained('bert-base-uncased',
                                              num_labels=3,
                                              finetuning_task='mnli')
        model = model_class.from_pretrained('bert-base-uncased',
                                            from_tf=False,
                                            config=config)
    else:
        raise ValueError('Model not recognized.')

    logger.flush()

    ## Define the objective
    if args.hinge:
        assert args.dataset in ['CelebA', 'CUB']  # Only supports binary

        def hinge_loss(yhat, y):
            # The torch loss takes in three arguments so we need to split yhat
            # It also expects classes in {+1.0, -1.0} whereas by default we give them in {0, 1}
            # Furthermore, if y = 1 it expects the first input to be higher instead of the second,
            # so we need to swap yhat[:, 0] and yhat[:, 1]...
            torch_loss = torch.nn.MarginRankingLoss(margin=1.0, reduction='none')
            y = (y.float() * 2.0) - 1.0
            return torch_loss(yhat[:, 1], yhat[:, 0], y)

        criterion = hinge_loss
    else:
        criterion = torch.nn.CrossEntropyLoss(reduction='none')

    # BUG FIX: this branch was disabled with `if False:`, so resumed runs
    # always restarted epoch numbering at 0 even though the checkpoint was
    # loaded above; gate it on the resume flag as in the sibling script.
    if resume:
        df = pd.read_csv(os.path.join(args.log_dir, 'test.csv'))
        epoch_offset = df.loc[len(df) - 1, 'epoch'] + 1
        logger.write(f'starting from epoch {epoch_offset}')
    else:
        epoch_offset = 0

    train_csv_logger = CSVBatchLogger(os.path.join(args.log_dir, 'train.csv'), train_data.n_groups, mode=mode)
    val_csv_logger = CSVBatchLogger(os.path.join(args.log_dir, 'val.csv'), train_data.n_groups, mode=mode)
    test_csv_logger = CSVBatchLogger(os.path.join(args.log_dir, 'test.csv'), train_data.n_groups, mode=mode)

    train(model, criterion, data, logger, train_csv_logger, val_csv_logger,
          test_csv_logger, args, epoch_offset=epoch_offset)

    train_csv_logger.close()
    val_csv_logger.close()
    test_csv_logger.close()
def main():
    """Train a 28-class multi-label image classifier with SGDR-style restarts.

    Reads all hyper-parameters from the CLI, optionally warm-starts from a base
    model directory, trains with cosine-annealing warm restarts (saving one
    snapshot per cycle for ensembling), selects a sigmoid threshold on the
    validation set, and finally writes test predictions and submission.csv.

    Fix: the optimizer-state load used a bare ``except:`` which also swallowed
    ``KeyboardInterrupt``/``SystemExit``; narrowed to ``except Exception``.
    """
    args = argparser.parse_args()
    log_args(args)

    # --- unpack CLI arguments into locals ---
    input_dir = args.input_dir
    output_dir = args.output_dir
    base_model_dir = args.base_model_dir
    image_size = args.image_size
    crop_images = args.crop_images
    augment = args.augment
    use_progressive_image_sizes = args.use_progressive_image_sizes
    progressive_image_size_min = args.progressive_image_size_min
    progressive_image_size_step = args.progressive_image_size_step
    progressive_image_epoch_step = args.progressive_image_epoch_step
    batch_size = args.batch_size
    batch_iterations = args.batch_iterations
    num_workers = args.num_workers
    pin_memory = args.pin_memory
    epochs_to_train = args.epochs
    lr_scheduler_type = args.lr_scheduler
    lr_patience = args.lr_patience
    lr_min = args.lr_min
    lr_max = args.lr_max
    lr_min_decay = args.lr_min_decay
    lr_max_decay = args.lr_max_decay
    optimizer_type = args.optimizer
    loss_type = args.loss
    focal_loss_gamma = args.focal_loss_gamma
    use_class_weights = args.use_class_weights
    use_weighted_sampling = args.use_weighted_sampling
    model_type = args.model
    patience = args.patience
    sgdr_cycle_epochs = args.sgdr_cycle_epochs
    sgdr_cycle_epochs_mult = args.sgdr_cycle_epochs_mult
    sgdr_cycle_end_prolongation = args.sgdr_cycle_end_prolongation
    sgdr_cycle_end_patience = args.sgdr_cycle_end_patience
    max_sgdr_cycles = args.max_sgdr_cycles

    # Adam manages its own step sizes; force the matching scheduler mode.
    if optimizer_type == "adam":
        lr_scheduler_type = "adam"

    # Image sizes used when progressively growing the training resolution.
    progressive_image_sizes = list(
        range(progressive_image_size_min, image_size + 1,
              progressive_image_size_step))

    # --- data ---
    train_data = TrainData(input_dir)

    train_set = TrainDataset(train_data.train_set_df, input_dir, 28,
                             image_size, crop_images, augment)

    # Per-sample weights used for the optional class-balancing sampler.
    balance_weights, balance_class_weights = calculate_balance_weights(
        train_data.df, train_data.train_set_df, 28)
    train_set_sampler = WeightedRandomSampler(balance_weights,
                                              len(balance_weights))

    # A sampler and shuffle=True are mutually exclusive in DataLoader.
    train_set_data_loader = DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=False if use_weighted_sampling else True,
        sampler=train_set_sampler if use_weighted_sampling else None,
        num_workers=num_workers,
        pin_memory=pin_memory)

    val_set = TrainDataset(train_data.val_set_df, input_dir, 28, image_size,
                           crop_images, False)
    val_set_data_loader = \
        DataLoader(val_set,
                   batch_size=batch_size,
                   shuffle=False,
                   num_workers=num_workers,
                   pin_memory=pin_memory)

    # --- model / optimizer, optionally warm-started from a base run ---
    if base_model_dir:
        # Seed output_dir with every checkpoint from the base run, then load.
        for base_file_path in glob.glob("{}/*.pth".format(base_model_dir)):
            shutil.copyfile(
                base_file_path,
                "{}/{}".format(output_dir, os.path.basename(base_file_path)))
        model = create_model(type=model_type, num_classes=28).to(device)
        model.load_state_dict(
            torch.load("{}/model.pth".format(output_dir),
                       map_location=device))
        optimizer = create_optimizer(optimizer_type, model, lr_max)
        if os.path.isfile("{}/optimizer.pth".format(output_dir)):
            try:
                optimizer.load_state_dict(
                    torch.load("{}/optimizer.pth".format(output_dir)))
                adjust_initial_learning_rate(optimizer, lr_max)
                adjust_learning_rate(optimizer, lr_max)
            except Exception:
                # Best-effort: a stale/incompatible optimizer state must not
                # abort training (was a bare except, which also caught
                # KeyboardInterrupt/SystemExit).
                log("Failed to load the optimizer weights")
    else:
        model = create_model(type=model_type, num_classes=28).to(device)
        optimizer = create_optimizer(optimizer_type, model, lr_max)

    torch.save(model.state_dict(), "{}/model.pth".format(output_dir))

    # Continue the ensemble numbering after any existing model-<i>.pth files.
    ensemble_model_index = 0
    for model_file_path in glob.glob("{}/model-*.pth".format(output_dir)):
        model_file_name = os.path.basename(model_file_path)
        model_index = int(
            model_file_name.replace("model-", "").replace(".pth", ""))
        ensemble_model_index = max(ensemble_model_index, model_index + 1)

    epoch_iterations = ceil(len(train_set) / batch_size)

    log("train_set_samples: {}, val_set_samples: {}".format(
        len(train_set), len(val_set)))
    log()

    global_val_score_best_avg = float("-inf")
    sgdr_cycle_val_score_best_avg = float("-inf")

    lr_scheduler = CosineAnnealingLR(optimizer,
                                     T_max=sgdr_cycle_epochs,
                                     eta_min=lr_min)

    optim_summary_writer = SummaryWriter(
        log_dir="{}/logs/optim".format(output_dir))
    train_summary_writer = SummaryWriter(
        log_dir="{}/logs/train".format(output_dir))
    val_summary_writer = SummaryWriter(
        log_dir="{}/logs/val".format(output_dir))

    # --- SGDR bookkeeping ---
    current_sgdr_cycle_epochs = sgdr_cycle_epochs
    sgdr_next_cycle_end_epoch = current_sgdr_cycle_epochs + sgdr_cycle_end_prolongation
    sgdr_iterations = 0
    sgdr_cycle_count = 0
    batch_count = 0
    epoch_of_last_improval = 0

    # Alternative schedulers, used depending on lr_scheduler_type.
    lr_scheduler_plateau = \
        ReduceLROnPlateau(optimizer,
                          mode="max",
                          min_lr=lr_min,
                          patience=lr_patience,
                          factor=0.5,
                          threshold=1e-4)
    lr_scheduler_step = StepLR(optimizer, step_size=10, gamma=0.1)

    # Chart declarations consumed by the external log viewer.
    log('{"chart": "best_val_score", "axis": "epoch"}')
    log('{"chart": "val_score", "axis": "epoch"}')
    log('{"chart": "val_loss", "axis": "epoch"}')
    log('{"chart": "sgdr_cycle", "axis": "epoch"}')
    log('{"chart": "score", "axis": "epoch"}')
    log('{"chart": "loss", "axis": "epoch"}')
    log('{"chart": "lr_scaled", "axis": "epoch"}')
    log('{"chart": "mem_used", "axis": "epoch"}')
    log('{"chart": "epoch_time", "axis": "epoch"}')

    train_start_time = time.time()

    loss_weight = CLASS_WEIGHTS_TENSOR if use_class_weights else None
    criterion = create_criterion(loss_type, loss_weight, focal_loss_gamma)

    for epoch in range(epochs_to_train):
        epoch_start_time = time.time()

        log("memory used: {:.2f} GB".format(psutil.virtual_memory().used /
                                            2**30))

        # Progressive resizing: grow the input resolution on a fixed schedule.
        if use_progressive_image_sizes:
            next_image_size = \
                progressive_image_sizes[min(epoch // progressive_image_epoch_step, len(progressive_image_sizes) - 1)]
            if train_set.image_size != next_image_size:
                log("changing image size to {}".format(next_image_size))
                train_set.image_size = next_image_size
                val_set.image_size = next_image_size

        model.train()

        train_loss_sum_t = zero_item_tensor()
        epoch_batch_iter_count = 0

        if lr_scheduler_type == "lr_finder":
            new_lr = lr_max * 0.5**(sgdr_cycle_epochs - min(
                sgdr_cycle_epochs, sgdr_iterations / epoch_iterations))
            adjust_learning_rate(optimizer, new_lr)

        all_predictions = []
        all_targets = []
        for b, batch in enumerate(train_set_data_loader):
            images, categories = \
                batch[0].to(device, non_blocking=True), \
                batch[1].to(device, non_blocking=True)

            if lr_scheduler_type == "cosine_annealing":
                # Fractional epoch so the cosine curve advances per batch.
                lr_scheduler.step(
                    epoch=min(current_sgdr_cycle_epochs,
                              sgdr_iterations / epoch_iterations))

            # Gradient accumulation over batch_iterations mini-batches.
            if b % batch_iterations == 0:
                optimizer.zero_grad()

            prediction_logits = model(images)
            # NOTE(review): this unconditionally overrides the weight chosen
            # via --use_class_weights above (loss_weight may be None) —
            # confirm whether the flag is meant to have any effect here.
            criterion.weight = CLASS_WEIGHTS_TENSOR
            loss = criterion(prediction_logits, categories)
            loss.backward()

            with torch.no_grad():
                train_loss_sum_t += loss
                all_predictions.extend(
                    torch.sigmoid(prediction_logits).cpu().data.numpy())
                all_targets.extend(categories.cpu().data.numpy())

            if (b + 1) % batch_iterations == 0 or (
                    b + 1) == len(train_set_data_loader):
                optimizer.step()
                sgdr_iterations += 1
                batch_count += 1
                epoch_batch_iter_count += 1

                optim_summary_writer.add_scalar("lr",
                                                get_learning_rate(optimizer),
                                                batch_count + 1)

        # Per-epoch training metrics (loss averaged per optimizer step).
        train_loss_avg = train_loss_sum_t.item() / epoch_batch_iter_count
        train_score_avg = f1_score_from_probs(torch.tensor(all_predictions),
                                              torch.tensor(all_targets))

        val_loss_avg, val_score_avg = evaluate(model, val_set_data_loader,
                                               criterion)

        if lr_scheduler_type == "reduce_on_plateau":
            lr_scheduler_plateau.step(val_score_avg)
        elif lr_scheduler_type == "step":
            lr_scheduler_step.step(epoch)

        # Snapshot the best model within the current SGDR cycle (for the
        # per-cycle ensemble member).
        model_improved_within_sgdr_cycle = check_model_improved(
            sgdr_cycle_val_score_best_avg, val_score_avg)
        if model_improved_within_sgdr_cycle:
            torch.save(
                model.state_dict(),
                "{}/model-{}.pth".format(output_dir, ensemble_model_index))
            sgdr_cycle_val_score_best_avg = val_score_avg

        # Globally best model: also persist optimizer state and train outputs.
        model_improved = check_model_improved(global_val_score_best_avg,
                                              val_score_avg)
        ckpt_saved = False
        if model_improved:
            torch.save(model.state_dict(), "{}/model.pth".format(output_dir))
            torch.save(optimizer.state_dict(),
                       "{}/optimizer.pth".format(output_dir))
            np.save("{}/train_predictions.npy".format(output_dir),
                    all_predictions)
            np.save("{}/train_targets.npy".format(output_dir), all_targets)
            global_val_score_best_avg = val_score_avg
            epoch_of_last_improval = epoch
            ckpt_saved = True

        # SGDR warm restart: start a new (possibly longer) cosine cycle with
        # decayed lr bounds and a fresh ensemble snapshot slot.
        sgdr_reset = False
        if (lr_scheduler_type == "cosine_annealing") \
                and (epoch + 1 >= sgdr_next_cycle_end_epoch) \
                and (epoch - epoch_of_last_improval >= sgdr_cycle_end_patience):
            sgdr_iterations = 0
            current_sgdr_cycle_epochs = int(current_sgdr_cycle_epochs *
                                            sgdr_cycle_epochs_mult)
            sgdr_next_cycle_end_epoch = epoch + 1 + current_sgdr_cycle_epochs + sgdr_cycle_end_prolongation
            ensemble_model_index += 1
            sgdr_cycle_val_score_best_avg = float("-inf")
            sgdr_cycle_count += 1
            sgdr_reset = True

            new_lr_min = lr_min * (lr_min_decay**sgdr_cycle_count)
            new_lr_max = lr_max * (lr_max_decay**sgdr_cycle_count)
            new_lr_max = max(new_lr_max, new_lr_min)

            adjust_learning_rate(optimizer, new_lr_max)
            lr_scheduler = CosineAnnealingLR(optimizer,
                                             T_max=current_sgdr_cycle_epochs,
                                             eta_min=new_lr_min)

        optim_summary_writer.add_scalar("sgdr_cycle", sgdr_cycle_count,
                                        epoch + 1)

        train_summary_writer.add_scalar("loss", train_loss_avg, epoch + 1)
        train_summary_writer.add_scalar("score", train_score_avg, epoch + 1)
        val_summary_writer.add_scalar("loss", val_loss_avg, epoch + 1)
        val_summary_writer.add_scalar("score", val_score_avg, epoch + 1)

        epoch_end_time = time.time()
        epoch_duration_time = epoch_end_time - epoch_start_time

        log("[%03d/%03d] %ds, lr: %.6f, loss: %.4f, val_loss: %.4f, score: %.4f, val_score: %.4f, ckpt: %d, rst: %d"
            % (epoch + 1, epochs_to_train, epoch_duration_time,
               get_learning_rate(optimizer), train_loss_avg, val_loss_avg,
               train_score_avg, val_score_avg, int(ckpt_saved),
               int(sgdr_reset)))

        log('{"chart": "best_val_score", "x": %d, "y": %.4f}' %
            (epoch + 1, global_val_score_best_avg))
        log('{"chart": "val_loss", "x": %d, "y": %.4f}' %
            (epoch + 1, val_loss_avg))
        log('{"chart": "val_score", "x": %d, "y": %.4f}' %
            (epoch + 1, val_score_avg))
        log('{"chart": "sgdr_cycle", "x": %d, "y": %d}' %
            (epoch + 1, sgdr_cycle_count))
        log('{"chart": "loss", "x": %d, "y": %.4f}' %
            (epoch + 1, train_loss_avg))
        log('{"chart": "score", "x": %d, "y": %.4f}' %
            (epoch + 1, train_score_avg))
        log('{"chart": "lr_scaled", "x": %d, "y": %.4f}' %
            (epoch + 1, 1000 * get_learning_rate(optimizer)))
        log('{"chart": "mem_used", "x": %d, "y": %.2f}' %
            (epoch + 1, psutil.virtual_memory().used / 2**30))
        log('{"chart": "epoch_time", "x": %d, "y": %d}' %
            (epoch + 1, epoch_duration_time))

        # Early stopping: no global improvement for `patience` epochs.
        if (sgdr_reset or lr_scheduler_type in ("reduce_on_plateau", "step")) \
                and epoch - epoch_of_last_improval >= patience:
            log("early abort due to lack of improval")
            break

        if max_sgdr_cycles is not None and sgdr_cycle_count >= max_sgdr_cycles:
            log("early abort due to maximum number of sgdr cycles reached")
            break

    optim_summary_writer.close()
    train_summary_writer.close()
    val_summary_writer.close()

    train_end_time = time.time()
    log()
    log("Train time: %s" %
        str(datetime.timedelta(seconds=train_end_time - train_start_time)))

    # --- inference: reload the globally best model ---
    model.load_state_dict(
        torch.load("{}/model.pth".format(output_dir), map_location=device))

    val_predictions, val_targets = predict(model, val_set_data_loader)
    np.save("{}/val_predictions.npy".format(output_dir), val_predictions)
    np.save("{}/val_targets.npy".format(output_dir), val_targets)

    # Pick the sigmoid threshold that maximizes the validation score.
    best_threshold, best_threshold_score, all_threshold_scores = calculate_best_threshold(
        val_predictions, val_targets)
    log("All threshold scores: {}".format(all_threshold_scores))
    log("Best threshold / score: {} / {}".format(best_threshold,
                                                 best_threshold_score))

    test_data = TestData(input_dir)
    test_set = TestDataset(test_data.test_set_df, input_dir, image_size,
                           crop_images)
    test_set_data_loader = \
        DataLoader(test_set,
                   batch_size=batch_size,
                   shuffle=False,
                   num_workers=num_workers,
                   pin_memory=pin_memory)

    test_predictions, _ = predict(model, test_set_data_loader)
    np.save("{}/test_predictions.npy".format(output_dir), test_predictions)

    predicted_categories = calculate_categories_from_predictions(
        test_predictions, threshold=best_threshold)

    # --- submission file: space-separated category ids per sample ---
    submission_df = test_data.test_set_df.copy()
    submission_df["Predicted"] = [
        " ".join(map(str, pc)) for pc in predicted_categories
    ]
    submission_df.to_csv("{}/submission.csv".format(output_dir))
def main(args):
    """Entry point: set up logging, data loaders, model and loss, then run
    group-DRO training via ``train``.

    Optionally splits the training set into CV folds, and/or up-weights the
    rows flagged in ``args.aug_col`` by duplicating them in the dataset.
    """
    if args.wandb:
        wandb.init(project=f"{args.project_name}_{args.dataset}")
        wandb.config.update(args)

    # BERT-specific configs copied over from run_glue.py
    if (args.model.startswith("bert") and args.use_bert_params):
        args.max_grad_norm = 1.0
        args.adam_epsilon = 1e-8
        args.warmup_steps = 0

    # Resume only when the log dir already exists; mode "a"/"w" controls
    # whether the log and CSV files are appended to or overwritten.
    if os.path.exists(args.log_dir) and args.resume:
        resume = True
        mode = "a"
    else:
        resume = False
        mode = "w"

    ## Initialize logs
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)

    logger = Logger(os.path.join(args.log_dir, "log.txt"), mode)
    # Record args
    log_args(args, logger)
    set_seed(args.seed)

    # Data
    # Test data for label_shift_step is not implemented yet
    test_data = None
    test_loader = None
    if args.shift_type == "confounder":
        train_data, val_data, test_data = prepare_data(
            args,
            train=True,
        )

    elif args.shift_type == "label_shift_step":
        raise NotImplementedError
        train_data, val_data = prepare_data(args, train=True)

    #########################################################################
    ###################### Prepare data for our method ######################
    #########################################################################

    # Should probably not be upweighting if folds are specified.
    assert not args.fold or not args.up_weight

    # Fold passed. Use it as train and valid.
    if args.fold:
        train_data, val_data = folds.get_fold(
            train_data,
            args.fold,
            cross_validation_ratio=(1 / args.num_folds_per_sweep),
            num_valid_per_point=args.num_sweeps,
            seed=args.seed,
        )

    if args.up_weight != 0:
        assert args.aug_col is not None
        # Get points that should be upsampled
        metadata_df = pd.read_csv(args.metadata_path)
        # jigsaw stores the split as a string, other datasets as an int code.
        if args.dataset == "jigsaw":
            train_col = metadata_df[metadata_df["split"] == "train"]
        else:
            train_col = metadata_df[metadata_df["split"] == 0]
        aug_indices = np.where(train_col[args.aug_col] == 1)[0]
        print("len", len(train_col), len(aug_indices))
        if args.up_weight == -1:
            # -1 means: duplicate flagged points until they roughly match the
            # number of unflagged training points.
            up_weight_factor = int(
                (len(train_col) - len(aug_indices)) / len(aug_indices)) - 1
        else:
            up_weight_factor = args.up_weight

        print(f"Up-weight factor: {up_weight_factor}")
        # Up-weighting is implemented by concatenating duplicated subsets.
        upsampled_points = Subset(train_data,
                                  list(aug_indices) * up_weight_factor)
        # Convert to DRODataset
        train_data = dro_dataset.DRODataset(
            ConcatDataset([train_data, upsampled_points]),
            process_item_fn=None,
            n_groups=train_data.n_groups,
            n_classes=train_data.n_classes,
            group_str_fn=train_data.group_str,
        )
    elif args.aug_col is not None:
        print("\n"*2 + "WARNING: aug_col is not being used." + "\n"*2)

    #########################################################################
    #########################################################################
    #########################################################################

    loader_kwargs = {
        "batch_size": args.batch_size,
        "num_workers": 4,
        "pin_memory": True,
    }
    # Only the train loader reweights groups; val/test keep natural sampling.
    train_loader = dro_dataset.get_loader(train_data,
                                          train=True,
                                          reweight_groups=args.reweight_groups,
                                          **loader_kwargs)

    val_loader = dro_dataset.get_loader(val_data,
                                        train=False,
                                        reweight_groups=None,
                                        **loader_kwargs)

    if test_data is not None:
        test_loader = dro_dataset.get_loader(test_data,
                                             train=False,
                                             reweight_groups=None,
                                             **loader_kwargs)

    data = {}
    data["train_loader"] = train_loader
    data["val_loader"] = val_loader
    data["test_loader"] = test_loader
    data["train_data"] = train_data
    data["val_data"] = val_data
    data["test_data"] = test_data

    n_classes = train_data.n_classes

    log_data(data, logger)

    ## Initialize model
    model = get_model(
        model=args.model,
        pretrained=not args.train_from_scratch,
        resume=resume,
        n_classes=train_data.n_classes,
        dataset=args.dataset,
        log_dir=args.log_dir,
    )
    if args.wandb:
        wandb.watch(model)

    logger.flush()

    ## Define the objective
    if args.hinge:
        assert args.dataset in ["CelebA", "CUB"]  # Only supports binary
        criterion = hinge_loss
    else:
        # Per-sample losses (reduction="none") are required by group DRO.
        criterion = torch.nn.CrossEntropyLoss(reduction="none")

    if resume:
        raise NotImplementedError  # Check this implementation.
        # NOTE(review): the lines below are unreachable until the raise above
        # is removed; they restore the epoch counter from the test CSV log.
        df = pd.read_csv(os.path.join(args.log_dir, "test.csv"))
        epoch_offset = df.loc[len(df) - 1, "epoch"] + 1
        logger.write(f"starting from epoch {epoch_offset}")
    else:
        epoch_offset = 0

    train_csv_logger = CSVBatchLogger(os.path.join(args.log_dir, f"train.csv"),
                                      train_data.n_groups,
                                      mode=mode)
    val_csv_logger = CSVBatchLogger(os.path.join(args.log_dir, f"val.csv"),
                                    val_data.n_groups,
                                    mode=mode)
    test_csv_logger = CSVBatchLogger(os.path.join(args.log_dir, f"test.csv"),
                                     test_data.n_groups,
                                     mode=mode)

    train(
        model,
        criterion,
        data,
        logger,
        train_csv_logger,
        val_csv_logger,
        test_csv_logger,
        args,
        epoch_offset=epoch_offset,
        csv_name=args.fold,
        wandb=wandb if args.wandb else None,
    )

    train_csv_logger.close()
    val_csv_logger.close()
    test_csv_logger.close()
# NOTE(review): script fragment — `parser` (and VALID_MODES) are defined
# above this chunk; only the tail of the argparse setup is visible here.
parser.add_argument("--mode", "-m", choices=VALID_MODES, required=True)
parser.add_argument("--verbosity", "-v", type=int, default=int(logging.INFO),
                    help="""
    Verbosity levels in python:
        NOTSET = 0
        DEBUG = 10
        INFO = 20
        WARNING = 30
        WARN = WARNING
        ERROR = 40
        CRITICAL = 50
        FATAL = CRITICAL
    """)
# TODO: add verbosity so we know the book being filtered
args = parser.parse_args()
# generate_plaintext_corpus(args)
logging.basicConfig(format='%(message)s')
utils.log_args(args)
logging.getLogger().setLevel(args.verbosity)
"""
TODO:
- apache spark?
- add some form of logging for each book
"""
generate_textid_corpus(args)
def main():
    """Score LDA topics against GermaNet semantic similarity measures.

    Loads topic (or reranked-topic) term tables, maps each term to its
    GermaNet synsets, computes per-topic aggregate similarity scores
    (Leacock-Chodorow, Resnik, Jiang-Conrath, Lin) and writes them to CSV.
    """
    # parse_args supplies the dataset/version selection plus flags; the exact
    # semantics of each element come from the project-level parser.
    (dataset, version, params, nbtopics, topn, cores, corpus_type,
     use_coherence, use_w2v, rerank, lsi, args) = parse_args()
    logger = init_logging(name=f'Eval_topics_on_germanet_{dataset}',
                          basic=False,
                          to_stdout=True,
                          to_file=True)
    log_args(logger, args)
    logg = logger.info

    # Either evaluate reranked topics or the raw topic tables.
    purpose = 'rerank' if rerank else 'topics'
    topics = load(purpose, dataset, version, corpus_type, lsi, *params,
                  *nbtopics)
    # topn > 0 limits the number of rows; otherwise use all columns as topn.
    if topn > 0:
        topics = topics[:topn]
    else:
        topn = topics.shape[1]
    logg(f'Number of topics {topics.shape}')

    logg('Getting SynSets for topic terms')
    # One GermaNet synset list per topic term (element-wise over the frame).
    sstopics = topics.applymap(gn.synsets)

    # For each measure: the aggregate over topic-term pairs. Similarities
    # (lch/res/lin) are maximized, the jcn distance is minimized.
    # NOTE(review): the *_ignr_unkwn columns pass ignore_unknown=False, i.e.
    # they do NOT ignore unknown terms — the plain columns presumably do
    # (default); confirm the naming intent against `similarities`.
    topics['lch'] = sstopics.progress_apply(similarities,
                                            axis=1,
                                            sim_func=Synset.sim_lch,
                                            agg_func=max,
                                            topn=topn)
    topics['lch_ignr_unkwn'] = sstopics.progress_apply(similarities,
                                                       axis=1,
                                                       sim_func=Synset.sim_lch,
                                                       agg_func=max,
                                                       topn=topn,
                                                       ignore_unknown=False)
    topics['res'] = sstopics.progress_apply(similarities,
                                            axis=1,
                                            sim_func=Synset.sim_res,
                                            agg_func=max,
                                            topn=topn)
    topics['res_ignr_unkwn'] = sstopics.progress_apply(similarities,
                                                       axis=1,
                                                       sim_func=Synset.sim_res,
                                                       agg_func=max,
                                                       topn=topn,
                                                       ignore_unknown=False)
    topics['jcn'] = sstopics.progress_apply(similarities,
                                            axis=1,
                                            sim_func=Synset.dist_jcn,
                                            agg_func=min,
                                            topn=topn)
    topics['jcn_ignr_unkwn'] = sstopics.progress_apply(
        similarities,
        axis=1,
        sim_func=Synset.dist_jcn,
        agg_func=min,
        topn=topn,
        ignore_unknown=False)
    topics['lin'] = sstopics.progress_apply(similarities,
                                            axis=1,
                                            sim_func=Synset.sim_lin,
                                            agg_func=max,
                                            topn=topn)
    topics['lin_ignr_unkwn'] = sstopics.progress_apply(similarities,
                                                       axis=1,
                                                       sim_func=Synset.sim_lin,
                                                       agg_func=max,
                                                       topn=topn,
                                                       ignore_unknown=False)

    # Drop the original topn term columns; keep only the appended scores.
    topics = topics.iloc[:, topn:]

    tpx_path = join(LDA_PATH, version, 'bow', 'topics')
    if rerank:
        file = join(tpx_path, f'{dataset}_reranker-eval_germanet.csv')
    else:
        file = join(
            tpx_path,
            f'{dataset}{"_"+lsi if lsi else ""}_{version}_{corpus_type}_topic-scores_germanet.csv'
        )
    # Never overwrite an existing result file: suffix with a unix timestamp.
    if exists(file):
        file = file.replace('.csv', f'_{str(time()).split(".")[0]}.csv')
    logg(f'Writing {file}')
    topics.to_csv(file)
    logg('done')
def main():
    """Train LDA models over a grid of parametrizations and topic counts.

    For every (param, nbtopics) combination: build callbacks, train an
    LdaModel, and persist topics (CSV), metrics (JSON) and the model itself.

    Fix: ``metrics.append(('env_id', current_metrics))`` appended the literal
    string ``'env_id'`` instead of the ``env_id`` variable (built as
    ``f"{dataset}-{model_class}"`` and used correctly in ``vis.save``).
    """
    global LOGG

    # --- arguments ---
    (dataset, version, cb_logger, params, nbs_topics, epochs, cores,
     cache_in_memory, use_callbacks, corpus_type, args) = parse_args()

    model_class = 'LDAmodel'
    _split = "_split" if use_callbacks else ""

    # --- logging ---
    logger = init_logging(
        name=f'LDA_{dataset}_{version}_{corpus_type}{_split}_ep{epochs}',
        basic=False,
        to_stdout=True,
        to_file=True)
    LOGG = logger.info
    log_args(logger, args)

    # --- load texts ---
    # Raw texts are only needed by the coherence callbacks.
    if use_callbacks:
        texts = load(dataset, version, 'texts', logger=logger)
    else:
        texts = []

    # --- load dict ---
    dictionary = load(dataset, version, corpus_type, 'dict', logger=logger)

    # --- load corpus ---
    corpus = load(dataset, version, corpus_type, 'corpus', logger=logger)
    if cache_in_memory:
        LOGG('Reading corpus into RAM')
        corpus = list(corpus)
    # Hold out a test split only when callbacks will evaluate on it.
    if use_callbacks:
        train, test = split_corpus(corpus)
    else:
        train, test = corpus, []
    LOGG(f'size of... train_set={len(train)}, test_set={len(test)}')

    # --- enable visdom ---
    # Fall back to shell logging when no visdom server is reachable.
    vis = None
    if cb_logger == 'visdom':
        try:
            import visdom
            vis = visdom.Visdom()
        except Exception as e:
            LOGG(e)
            cb_logger = 'shell'

    # --- train ---
    topn = 20
    # Column layout of the topics CSV: topn terms followed by topn weights.
    columns = [f'term{x}' for x in range(topn)] + \
              [f'weight{x}' for x in range(topn)]
    metrics = []
    for param in params:
        env_id = f"{dataset}-{model_class}"
        for nbtopics in nbs_topics:
            gc.collect()

            callbacks = init_callbacks(dataset=dataset,
                                       callback_logger=cb_logger,
                                       documents=texts,
                                       training_corpus=train,
                                       test_corpus=test,
                                       processes=cores,
                                       version=version,
                                       param=param,
                                       nbtopics=nbtopics,
                                       tfidf=corpus_type)
            # Without callbacks only the last (cheap) one is kept.
            if not use_callbacks:
                callbacks = callbacks[-1:]
            else:
                LOGG('Initializing Callbacks')

            kwargs = get_parameterset(train,
                                      dictionary,
                                      callbacks=callbacks,
                                      nbtopics=nbtopics,
                                      parametrization=param,
                                      epochs=epochs)

            LOGG(
                f'Running {model_class} {corpus_type} "{param}{_split}" with {nbtopics} topics'
            )
            model = LdaModel(**kwargs)
            gc.collect()

            model_dir = join(LDA_PATH, version, corpus_type,
                             f'{param}{_split}')
            model_path = join(
                model_dir,
                f'{dataset}_LDAmodel_{param}{_split}_{nbtopics}_ep{epochs}')
            if not exists(model_dir):
                makedirs(model_dir)

            # --- save topics ---
            topics = model.show_topics(num_topics=-1,
                                       num_words=topn,
                                       formatted=False)
            # Flatten each topic's (term, weight) pairs into
            # [term0..termN, weight0..weightN] matching `columns`.
            topics = [list(chain(*zip(*topic[1]))) for topic in topics]
            topics = pd.DataFrame(topics, columns=columns)
            LOGG(f'Saving topics to {model_path}.csv')
            topics.to_csv(f'{model_path}.csv')

            # --- save metrics ---
            current_metrics = model.metrics
            # FIX: append the env_id variable, not the string literal 'env_id'.
            metrics.append((env_id, current_metrics))
            with open(f'{model_path}_metrics.json', 'w') as fp:
                serializable_metrics = {}
                for k, v in current_metrics.items():
                    # NOTE(review): skips the metric keyed by the dataset
                    # name — presumably not JSON-serializable; confirm.
                    if k == dataset:
                        continue
                    if isinstance(v[0], np.ndarray):
                        serializable_metrics[k] = [x.tolist() for x in v]
                    else:
                        serializable_metrics[k] = [float(x) for x in v]
                LOGG(f'Saving metrics to {model_path}_metrics.json')
                json.dump(serializable_metrics, fp)

            # --- save model ---
            LOGG(f'Saving LDAmodel to {model_path}')
            # Callbacks hold unpicklable state; strip them before saving.
            model.callbacks = None
            model.save(model_path)

            # --- save visdom environment ---
            if vis is not None:
                vis.save([env_id])

            gc.collect()

    # --- done ---
    LOGG(f'\n'
         f'----- end -----\n'
         f'----- {dataset.upper()} -----\n'
         f'{"#" * 50}\n')
# NOTE(review): fragment — the opening parser.add_argument(...) call above and
# the trailing DataLoader(...) call below are truncated outside this chunk.
    action='store_true', help='Call nn.DataParallel on model or not')
parser.add_argument('--num_neg', default=None, type=int)
args = parser.parse_args()
# --num_neg has no usable default; require it explicitly.
assert args.num_neg is not None
if args.model == 'resnet10vw':
    assert args.width is not None
set_seed(args.seed)
model_path, batch_size, epochs = args.model_path, args.batch_size, args.epochs
# Verbose mode logs at INFO; otherwise DEBUG (i.e. everything) is logged.
log_level = logging.INFO if args.verbose else logging.DEBUG
logger = utils.get_logger(
    name=__name__, filename=args.logpath,
    console_log_level=log_level)  # default we log everything to console
log_args(args, logger)
logger.info("Loading Data")
# STL10 train/test splits with the project's standard transforms.
train_data = STL10(root='data',
                   split='train',
                   transform=utils.train_transform)
train_loader = DataLoader(train_data,
                          batch_size=batch_size,
                          shuffle=True,
                          num_workers=2,
                          pin_memory=True)
test_data = STL10(root='data', split='test', transform=utils.test_transform)
test_loader = DataLoader(test_data, batch_size=batch_size,
def main():
    """Train a word2vec (or fastText) embedding model on the dewiki sample
    corpus and persist it, with periodic checkpoints via EpochSaver."""
    # --- argument parsing ---
    (model_name, epochs, min_count, cores, checkpoint_every,
     cache_in_memory, lowercase, fasttext,
     args) = parse_args(default_model_name='w2v_default', default_epochs=100)

    # --- init logging ---
    logger = init_logging(name=model_name, basic=True, to_file=True, to_stdout=False)
    log_args(logger, args)

    input_dir = join(SMPL_PATH, 'dewiki')
    model_dir = join(EMB_PATH, model_name)
    if not exists(model_dir):
        makedirs(model_dir)
    logger.info('model dir: ' + model_dir)

    start_time = time()

    # Either materialize all sentences up front or stream them lazily
    # (with a file cache) during training.
    if cache_in_memory:  # needs approx. 25GB of RAM
        logger.info('cache data in memory')
        sentences = list(Sentences(input_dir, logger, lowercase=lowercase))
    else:
        sentences = Sentences(input_dir, logger, use_file_cache=True, lowercase=lowercase)
    gc.collect()

    # Model initialization — both variants share the same skip-gram setup,
    # fastText additionally trains character n-grams of length 3..6.
    logger.info('Initializing new model')
    shared_params = dict(
        size=300,
        window=5,
        min_count=min_count,
        sample=1e-5,
        negative=5,
        sg=1,
        seed=42,
        iter=epochs,
        workers=cores,
    )
    if fasttext:
        model = FastText(min_n=3, max_n=6, **shared_params)
    else:
        model = Word2Vec(**shared_params)

    logger.info('Building vocab')
    model.build_vocab(sentences, progress_per=100_000)

    # Model Training
    epoch_saver = EpochSaver(model_name, model_dir, checkpoint_every)
    epoch_logger = EpochLogger(logger)

    logger.info('Training {:d} epochs'.format(epochs))
    model.train(
        sentences,
        total_examples=model.corpus_count,
        epochs=model.epochs,
        report_delay=60,
        callbacks=[epoch_logger, epoch_saver],
    )

    # saving model — callbacks are not picklable, drop them first.
    file_path = join(model_dir, model_name)
    logger.info('Writing model to ' + file_path)
    model.callbacks = ()
    model.save(file_path)

    elapsed = int(time() - start_time)
    logger.info("all done in {:02d}:{:02d}:{:02d}".format(
        elapsed // 3600, (elapsed // 60) % 60, elapsed % 60))
def init_logging():
    """Log the startup banner: experiment name, full command line, and the
    run parameters (via log_args on the module-level ``params``)."""
    command_line = " ".join(sys.argv)
    logger.info("Representativeness experiments running ...")
    logger.info("python3 " + command_line)
    log_args(params)
def main():
    """Train a doc2vec (DBOW) model on the dewiki sample corpus and save it,
    with periodic checkpoints via EpochSaver.

    Fix: the ``min_count`` value parsed from the CLI is now passed to Doc2Vec;
    it was previously shadowed by a hard-coded ``min_count=20`` (the sibling
    word2vec script already forwards the parsed value).
    """
    # --- argument parsing ---
    (model_name, epochs, min_count, cores, checkpoint_every, cache_in_memory,
     lowercase, _, args) = parse_args(default_model_name='d2v',
                                      default_epochs=20)

    # --- init logging ---
    logger = init_logging(name=model_name,
                          basic=True,
                          to_file=True,
                          to_stdout=False)
    log_args(logger, args)

    input_dir = join(SMPL_PATH, 'dewiki')
    model_dir = join(EMB_PATH, model_name)
    if not exists(model_dir):
        makedirs(model_dir)
    logger.info('model dir: ' + model_dir)

    t0 = time()
    documents = Documents(input_dir=input_dir,
                          logger=logger,
                          lowercase=lowercase)
    if cache_in_memory:
        documents = list(documents)
    gc.collect()

    # Model initialization: pure DBOW (dm=0) with word training (dbow_words=1).
    logger.info('Initializing new model')
    model = Doc2Vec(
        vector_size=300,
        window=15,
        min_count=min_count,  # was hard-coded to 20, silently ignoring the CLI
        sample=1e-5,
        negative=5,
        hs=0,
        dm=0,
        dbow_words=1,
        dm_concat=0,
        seed=42,
        epochs=epochs,
        workers=cores,
    )
    logger.info('Building vocab')
    model.build_vocab(documents)

    # Model Training
    epoch_saver = EpochSaver(model_name, model_dir, checkpoint_every)
    epoch_logger = EpochLogger(logger)

    logger.info('Training {:d} epochs'.format(epochs))
    model.train(
        documents,
        total_examples=model.corpus_count,
        epochs=model.epochs,
        report_delay=60,
        callbacks=[epoch_logger, epoch_saver],
    )

    # saving model — callbacks are not picklable, drop them first.
    file_path = join(model_dir, model_name)
    logger.info('Writing model to ' + file_path)
    model.callbacks = ()
    model.save(file_path)

    t1 = int(time() - t0)
    logger.info("all done in {:02d}:{:02d}:{:02d}".format(
        t1 // 3600, (t1 // 60) % 60, t1 % 60))
def main():
    """Build and train a DenseNet on CIFAR-10 using a TF1 graph + session.

    One reinitializable iterator feeds both splits; a boolean ``training``
    variable toggles train/eval behaviour (dropout, batch norm). Streams
    loss/accuracy metrics per epoch, writes summaries, and checkpoints after
    every epoch.
    """
    args = make_parser().parse_args()
    log_args(args)

    # Non-trainable graph state: step counter and train/eval mode flag.
    global_step = tf.get_variable('global_step',
                                  initializer=0,
                                  trainable=False)
    training = tf.get_variable('training', initializer=False, trainable=False)

    with tf.name_scope('data_loading'):
        train_ds, test_ds = cifar10.make_dataset(args.dataset_path)
        train_ds, test_ds = (train_ds.shuffle(args.shuffle).batch(
            args.batch_size), test_ds.batch(args.batch_size))

    # Single reinitializable iterator shared by both datasets.
    # NOTE(review): `iter` shadows the builtin — left as-is (doc-only pass).
    iter = tf.data.Iterator.from_structure((tf.float32, tf.int64),
                                           ((None, 32, 32, 3), (None)))
    x, y = iter.get_next()

    logits = densenet.densenet(x,
                               block_depth=args.block_depth,
                               growth_rate=args.growth_rate,
                               compression_factor=args.compression_factor,
                               bottleneck=True,
                               dropout=args.dropout,
                               weight_decay=args.weight_decay,
                               training=training)

    # Streaming (running-average) metrics: value tensor + update op each.
    loss, update_loss = metrics.loss(logits=logits, labels=y)
    accuracy, update_accuracy = metrics.accuracy(logits=logits, labels=y)

    # Run UPDATE_OPS (e.g. batch-norm moving averages) before the train step.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        class_loss = objectives.loss(logits=logits,
                                     labels=y,
                                     top_k=args.hard_negatives)
        reg_loss = tf.losses.get_regularization_loss()
        train_step = tf.train.AdamOptimizer(args.learning_rate).minimize(
            class_loss + reg_loss, global_step=global_step)

    # Re-init local vars (metric accumulators) at the start of each pass.
    locals_init = tf.local_variables_initializer()
    train_init = tf.group(training.assign(True),
                          iter.make_initializer(train_ds), locals_init)
    test_init = tf.group(training.assign(False),
                         iter.make_initializer(test_ds), locals_init)

    with tf.name_scope('summary'):
        tf.summary.scalar('loss', loss)
        tf.summary.scalar('accuracy', accuracy)
        merged = tf.summary.merge_all()

    saver = tf.train.Saver()
    with tf.Session() as sess, tf.summary.FileWriter(
            os.path.join(args.experiment_path, 'train'),
            sess.graph) as train_writer, tf.summary.FileWriter(
                os.path.join(args.experiment_path, 'test'),
                sess.graph) as test_writer:
        # Resume from the latest checkpoint when one exists.
        restore_path = tf.train.latest_checkpoint(args.experiment_path)
        if restore_path:
            print(warning('Restoring from checkpoint'))
            saver.restore(sess, restore_path)
        else:
            print(warning('Initializing'))
            sess.run(tf.global_variables_initializer())

        for epoch in range(args.epochs):
            # --- one training pass; loop ends on iterator exhaustion ---
            sess.run(train_init)
            for _ in tqdm(count(), desc='training'):
                try:
                    _, step = sess.run([(train_step, update_loss,
                                         update_accuracy), global_step])
                except tf.errors.OutOfRangeError:
                    break

            print(success('epoch: {}, step: {}'.format(epoch, step)))
            l, a, summary = sess.run([loss, accuracy, merged])
            print(
                success('(train) loss: {:.4f}, accuracy: {:.2f}'.format(
                    l, a * 100)))
            train_writer.add_summary(summary, step)
            train_writer.flush()

            # --- one evaluation pass over the test set ---
            sess.run(test_init)
            for _ in tqdm(count(), desc='evaluation'):
                try:
                    _, step = sess.run([(update_loss, update_accuracy),
                                        global_step])
                except tf.errors.OutOfRangeError:
                    break

            l, a, summary = sess.run([loss, accuracy, merged])
            print(
                success('(test) loss: {:.4f}, accuracy: {:.2f}'.format(
                    l, a * 100)))
            test_writer.add_summary(summary, step)
            test_writer.flush()

            # Checkpoint after every epoch.
            save_path = saver.save(
                sess, os.path.join(args.experiment_path, 'model.ckpt'))
            print(warning('model saved: {}'.format(save_path)))
def main():
    """Entry point: prepare data/model/loss and run training from scratch
    (no resume support — logs are always opened in write mode).

    Supports resnet50 / densenet121 (optionally with MC-dropout heads) and
    bert-base-uncased; optionally saves predictions after training.
    """
    print("Loading and checking args...")
    args = parse_args()
    check_args(args)

    # BERT-specific configs copied over from run_glue.py
    if args.model.startswith('bert'):
        args.max_grad_norm = 1.0
        args.adam_epsilon = 1e-8
        args.warmup_steps = 0

    #Write for logging; assumes no existing logs.
    mode = 'w'

    ## Initialize logs
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)

    logger = Logger(os.path.join(args.log_dir, 'log.txt'), mode)
    # Record args
    log_args(args, logger)
    set_seed(args.seed)

    # Data
    print("Preparing data")
    train_data, val_data, test_data = prepare_data(args, train=True)

    print("Setting up loader")
    loader_kwargs = {
        'batch_size': args.batch_size,
        'num_workers': 4,
        'pin_memory': True
    }
    # Only the train loader optionally reweights groups.
    train_loader = train_data.get_loader(train=True,
                                         reweight_groups=args.reweight_groups,
                                         **loader_kwargs)
    val_loader = val_data.get_loader(train=False,
                                     reweight_groups=None,
                                     **loader_kwargs)
    test_loader = test_data.get_loader(train=False,
                                       reweight_groups=None,
                                       **loader_kwargs)

    data = {}
    data['train_loader'] = train_loader
    data['val_loader'] = val_loader
    data['test_loader'] = test_loader
    data['train_data'] = train_data
    data['val_data'] = val_data
    data['test_data'] = test_data
    n_classes = train_data.n_classes

    log_data(data, logger)

    ## Initialize model
    if args.model == 'resnet50':
        model = torchvision.models.resnet50(pretrained=True)
        d = model.fc.in_features
        # Replace the ImageNet head with an n_classes classifier.
        model.fc = nn.Linear(d, n_classes)
        if args.mc_dropout:
            model = add_dropout(model, 'fc')
    elif args.model == 'densenet121':
        model = torchvision.models.densenet121(pretrained=True)
        d = model.classifier.in_features
        model.classifier = nn.Linear(d, n_classes)
        if args.mc_dropout:
            model = add_dropout(model, 'classifier')
    elif args.model == 'bert-base-uncased':
        print("Loading bert")
        model = BertForSequenceClassification.from_pretrained(
            args.model, num_labels=n_classes)
    else:
        raise ValueError('Model not recognized.')

    logger.flush()

    # Per-sample losses (reduction='none') — reduced downstream by `train`.
    criterion = torch.nn.CrossEntropyLoss(reduction='none')

    print("Getting loggers")
    # NOTE(review): val/test CSV loggers are sized with train_data.n_groups —
    # presumably the group count is identical across splits; confirm.
    train_csv_logger = CSVBatchLogger(os.path.join(args.log_dir, 'train.csv'),
                                      train_data.n_groups,
                                      mode=mode)
    val_csv_logger = CSVBatchLogger(os.path.join(args.log_dir, 'val.csv'),
                                    train_data.n_groups,
                                    mode=mode)
    test_csv_logger = CSVBatchLogger(os.path.join(args.log_dir, 'test.csv'),
                                     train_data.n_groups,
                                     mode=mode)

    print("Starting to train...")
    train(model,
          criterion,
          data,
          logger,
          train_csv_logger,
          val_csv_logger,
          test_csv_logger,
          args,
          epoch_offset=0,
          train=True)

    train_csv_logger.close()
    val_csv_logger.close()
    test_csv_logger.close()

    if args.save_preds:
        save_preds(model, data, args)
    return