def filter3():
    dir3 = "%s/FSUBTEST/3" % params.JADER_OUT
    utils.ensure_dir(dir3)
    dDrug1Se = dict()
    dDrug2Se = dict()
    fin1 = open("%s/FSUBTEST/1/1.txt" % params.JADER_OUT)
    fin2 = open("%s/FSUBTEST/2/2.txt" % params.JADER_OUT)
    while True:
        line = fin1.readline()
        if line == "":
            break
        parts = line.strip().split("\t")
        drug = parts[0]
        ses = set(parts[1].split(","))
        dDrug1Se[drug] = ses
    fin1.close()
    while True:
        line = fin2.readline()
        if line == "":
            break
        parts = line.strip().split("\t")
        drug = parts[0]
        ses = set(parts[1].split(","))
        dDrug2Se[drug] = ses
    fin2.close()
    fin = open("%s/SUB/3" % params.JADER_OUT)
    fout = open("%s/SUB/F3" % params.JADER_OUT, "w")
    while True:
        line = fin.readline()
        if line == "":
            break
        line = line.strip()
        parts = line.split("$")
        dDrug = parts[0].split(",")
        ses = parts[1].split(",")
        # collect side effects already explained by single drugs
        invalidSes = set()
        for drug in dDrug:
            sD = utils.get_dict(dDrug1Se, drug, set())
            for s in sD:
                invalidSes.add(s)
        # collect side effects already explained by drug pairs
        drugS = sorted(dDrug)
        drugPairs = []
        for i in range(len(drugS)):
            for j in range(i + 1, len(drugS)):
                pair = "%s,%s" % (drugS[i], drugS[j])
                drugPairs.append(pair)
        for pair in drugPairs:
            sD = utils.get_dict(dDrug2Se, pair, set())
            for s in sD:
                invalidSes.add(s)
        # keep only side effects not explained by lower-order combinations
        validSes = []
        for se in ses:
            if se not in invalidSes:
                validSes.append(se)
        fout.write("%s$%s\n" % (parts[0], ",".join(validSes)))
    fin.close()
    fout.close()
def builder():
    args = _get_parser()
    check_file(args.infile)
    ensure_dir(args.output)

    A = ahocorasick.Automaton()
    origin, annotation = list(), list()
    infile = open(args.infile, 'r', encoding='utf-8')
    for line in infile:
        line = line.rstrip()
        if not line:
            continue
        phrase, means = line.split(':::')
        if not phrase or not means:
            continue
        origin.append(phrase)
        annotation.append(means)
    infile.close()
    assert len(origin) == len(annotation)

    for idx, phrase in enumerate(origin):
        A.add_word(phrase, (idx, phrase))
    A.make_automaton()

    ac_name = os.path.join(args.output, args.ac_name)
    means = os.path.join(args.output, args.mean_name)
    with open(ac_name, 'wb') as outfile:
        pickle.dump(A, outfile, protocol=pickle.HIGHEST_PROTOCOL)
    with open(means, 'wb') as outfile:
        pickle.dump(annotation, outfile, protocol=pickle.HIGHEST_PROTOCOL)
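# A minimal usage sketch for the artifacts builder() writes. The file names below
# are assumptions standing in for args.ac_name / args.mean_name; the scan itself
# uses the standard pyahocorasick API, which yields (end_index, stored_value).
import pickle

with open("output/ac.pkl", "rb") as f:
    A = pickle.load(f)           # the ahocorasick.Automaton built above
with open("output/means.pkl", "rb") as f:
    annotation = pickle.load(f)  # the parallel list of meanings

text = "some input text"
for end_index, (idx, phrase) in A.iter(text):
    print(phrase, "->", annotation[idx])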
def __init__(self, model="FFNN"):
    resetRandomSeed()
    self.data = None
    utils.ensure_dir("%s/logs" % params.C_DIR)
    PREX = "FDA"
    if model == "FFNN":
        self.model = FFNNModel()
    elif model == "MILI":
        self.model = MILIModel()
    elif model == "MIL":
        self.model = MILModel()
    else:
        raise ValueError("Unknown model type: %s" % model)
    logPath = "%s/logs/%s_%s_%s" % (params.C_DIR, PREX, self.model.name,
                                    utils.getCurrentTimeString())
    self.logger = MyLogger(logPath)
    self.model.setLogger(self.logger)
    self.logger.infoAll(self.model.name)
    # self.logger.infoAll(self.model.model.named_parameters())
    self.logger.infoAll(("LAYERS, EMBEDDING_SIZE, WEIGHT, BATCH_SIZE",
                         params.N_LAYER, params.EMBEDDING_SIZE,
                         params.WEIGHT_ZERO, params.BATCH_SIZE))
    self.logger.infoAll(("NCHANELS, DK, MAX DRUG: ",
                         params.N_CHANEL, params.DK, params.MAX_N_DRUG))
def generate_tfrecords(self):
    utils.ensure_dir(self.outputDir)
    utils.ensure_dir(self.outputImagePathTR)
    utils.ensure_dir(self.outputImagePathVL)
    utils.ensure_dir(self.outputimagePathTS)

    num_of_images_for_train_and_val = self.PATIENT_END_TRAINING - self.PATIENT_START_TRAINING + 1
    images_for_validation = int(utils.percentage(self.percent_for_validation,
                                                 num_of_images_for_train_and_val))
    # Get a random list of patient ids for validation.
    validation_list = random.sample(
        range(self.PATIENT_START_TRAINING, self.PATIENT_END_TRAINING + 1),
        images_for_validation)

    # train and validation
    for j in range(self.PATIENT_START_TRAINING, self.PATIENT_END_TRAINING + 1):
        InputFileVolume = os.path.join(self.imagePathTR, "BRATS_%03d.nii.gz" % j)
        InputFileLabel = os.path.join(self.labelPathTR, "BRATS_%03d.nii.gz" % j)
        if os.path.isfile(InputFileVolume) and os.path.isfile(InputFileLabel):
            if j in validation_list:
                self.generate_tfrecord_from_patient(InputFileVolume, InputFileLabel,
                                                    False, True, False)
            else:
                self.generate_tfrecord_from_patient(InputFileVolume, InputFileLabel,
                                                    True, False, False)

    # test: patients have no label file, so only the volume needs to exist
    for j in range(self.PATIENT_START_TEST, self.PATIENT_END_TEST + 1):
        InputFileVolume = os.path.join(self.imagePathTS, "BRATS_%03d.nii.gz" % j)
        if os.path.isfile(InputFileVolume):
            self.generate_tfrecord_from_patient(InputFileVolume, "", False, False, True)
def init_logger(log_name, log_dir):
    """
    Logging module.
    1. Write logs both to the console and to a file.
    2. By default, keep the last 30 days of log files.
    """
    ensure_dir(log_dir)
    if log_name not in Logger.manager.loggerDict:
        logger = logging.getLogger(log_name)
        logger.setLevel(logging.DEBUG)

        datefmt = "%Y-%m-%d %H:%M:%S"
        format_str = "[%(asctime)s]: %(name)s %(filename)s[line:%(lineno)s] %(levelname)s %(message)s"
        formatter = logging.Formatter(format_str, datefmt)

        # daily-rotated file for INFO and above
        handler = TimedRotatingFileHandler(
            filename=os.path.join(log_dir, "%s.log" % log_name),
            when="D",
            backupCount=30,
        )
        handler.setFormatter(formatter)
        handler.setLevel(logging.INFO)
        logger.addHandler(handler)

        # console output
        console = logging.StreamHandler()
        console.setLevel(logging.INFO)
        console.setFormatter(formatter)
        logger.addHandler(console)

        # separate daily-rotated file for ERROR and above
        handler = TimedRotatingFileHandler(
            filename=os.path.join(log_dir, "ERROR.log"),
            when="D",
            backupCount=30,
        )
        handler.setFormatter(formatter)
        handler.setLevel(logging.ERROR)
        logger.addHandler(handler)

    logger = logging.getLogger(log_name)
    return logger
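# A minimal usage sketch for init_logger (the "app" name and "./logs" directory
# are illustrative assumptions, not values from the original module):
logger = init_logger("app", "./logs")
logger.info("written to ./logs/app.log and the console")
logger.error("additionally written to ./logs/ERROR.log")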
def filterg2():
    dir2 = "%s/FSUBTEST/2" % params.JADER_OUT
    utils.ensure_dir(dir2)
    dDrug1Se = dict()
    fin = open("%s/FSUBTEST/1/1.txt" % params.JADER_OUT)
    while True:
        line = fin.readline()
        if line == "":
            break
        parts = line.strip().split("\t")
        drug = parts[0]
        ses = set(parts[1].split(","))
        dDrug1Se[drug] = ses
    fin.close()
    fin = open("%s/SUB/G2" % params.JADER_OUT)
    fout = open("%s/SUB/GF2" % params.JADER_OUT, "w")
    while True:
        line = fin.readline()
        if line == "":
            break
        line = line.strip()
        parts = line.split("$")
        dDrug = parts[0].split(",")
        ses = parts[1].split(",")
        # collect side effects already explained by single drugs
        invalidSes = set()
        for drug in dDrug:
            sD = utils.get_dict(dDrug1Se, drug, set())
            for s in sD:
                invalidSes.add(s)
        validSes = []
        for se in ses:
            if se not in invalidSes:
                validSes.append(se)
        fout.write("%s$%s\n" % (parts[0], ",".join(validSes)))
    fin.close()
    fout.close()
def __init__(
    self,
    checkpoint_dir,
    monitor,
    logger,
    arch,
    save_best_only=True,
    best_model_name=None,
    epoch_model_name=None,
    mode="min",
    epoch_freq=1,
    best=None,
):
    self.monitor = monitor
    self.checkpoint_dir = checkpoint_dir
    self.save_best_only = save_best_only
    self.epoch_freq = epoch_freq
    self.arch = arch
    self.logger = logger
    self.best_model_name = best_model_name
    self.epoch_model_name = epoch_model_name
    self.use = "on_epoch_end"
    self.default_model_name = "pytorch_model.bin"

    if mode == "min":
        self.monitor_op = np.less
        self.best = np.Inf
    elif mode == "max":
        self.monitor_op = np.greater
        self.best = -np.Inf
    if best:
        self.best = best
    ensure_dir(self.checkpoint_dir)
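# A sketch of how these fields are typically consumed at epoch end. The original
# class's on_epoch_end is not shown, so this method body is an assumption about
# the usual checkpoint-callback pattern, not the author's code.
def on_epoch_end(self, epoch, logs):
    current = logs[self.monitor]
    # monitor_op is np.less for mode="min" and np.greater for mode="max"
    if self.monitor_op(current, self.best):
        self.best = current
        # save the improved weights, e.g. under
        # os.path.join(self.checkpoint_dir, self.default_model_name)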
    pin_memory=True
)
test_dataloader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=10,
    shuffle=False,
    num_workers=4,
    worker_init_fn=lambda id: utils.set_seed(seed + id)
)

# %%
if args.arch not in ['resnet18', 'resnet50']:
    print(f'Unknown arch {args.arch}')
    exit(0)

name = f'{args.arch}-unprocessed/{args.pretrain}/{args.train}'
utils.ensure_dir(f'logs/{vars.corda_version}/{name}')
utils.ensure_dir(f'models/{vars.corda_version}/{args.arch}-unprocessed/{args.pretrain}')

train_df.to_csv(f'logs/{vars.corda_version}/{name}/train.csv', index=False)
val_df.to_csv(f'logs/{vars.corda_version}/{name}/val.csv', index=False)
test_df.to_csv(f'logs/{vars.corda_version}/{name}/test.csv', index=False)

with open(f'logs/{vars.corda_version}/{name}/stats.txt', 'w') as f:
    f.write(f'Mean, std: {mean}, {std}\n')
    f.write(f'LR: {args.lr}, epochs: {args.epochs}\n')
    f.write(f'CORDA dataset size: {len(corda_df)} \n\n')

train_cov_size = [
    len(train_df[train_df.covid == 0]),
    len(train_df[train_df.covid == 1])
]
def ensureDIR():
    utils.ensure_dir("%s/FSUBTEST" % params.JADER_OUT)
    utils.ensure_dir("%s/FSUBTEST/1" % params.JADER_OUT)
    utils.ensure_dir("%s/FSUBTEST/2" % params.JADER_OUT)
    utils.ensure_dir("%s/SUB" % params.JADER_OUT)
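# A minimal driver sketch for the JADER filtering functions defined above. The
# original module's entry point is not shown, so the call order here is an
# assumption: create the output directories, then filter pair-level and
# triple-level side effects against the lower-order baselines.
if __name__ == "__main__":
    ensureDIR()
    filterg2()
    filter3()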
def main():
    args = _get_parser()

    # preliminary work
    check_file(args.infile)
    ensure_dir(args.output)
    if args.name_len_update:
        line_cnt = line_counter(args.infile)
        args.name_len = len(str(line_cnt)) + 1
    clean_dir(args.output, args.name_len)
    # end preliminary work

    p_bucket = defaultdict(list)
    save_idx = 0
    id_name = '{0:0' + str(args.name_len) + 'd}'

    # load stop words
    stop_words = get_stop_words(args.stop_words) if os.path.exists(args.stop_words) else list()
    # load tokenizer
    seg = Segmentor(args)

    print('Splitting sentence into different clusters ...')
    infile = open(args.infile, 'r', encoding="utf-8")
    for line in tqdm(infile):
        line = line.rstrip()
        is_match = False
        seg_list = list(seg.cut(line))
        if stop_words:
            seg_list = list(filter(lambda x: x not in stop_words, seg_list))
        for wd in seg_list:
            if is_match:
                break
            w_bucket = p_bucket[wd]
            for bucket in w_bucket:
                bucket_path = os.path.join(args.output, bucket)
                check_file(bucket_path)
                selected = sample_file(bucket_path, args.sample_number)
                selected = list(map(lambda x: list(seg.cut(x)), selected))
                # remove stop words
                if stop_words:
                    filt_selected = list()
                    for sen in selected:
                        sen = list(filter(lambda x: x not in stop_words, sen))
                        filt_selected.append(sen)
                    selected = filt_selected
                # calculate similarity with each bucket
                if all(jaccard(seg_list, cmp_list) > args.threshold for cmp_list in selected):
                    is_match = True
                    with open(bucket_path, 'a', encoding='utf-8') as outfile:
                        outfile.write(line + '\n')
                    for w in seg_list:
                        if bucket not in p_bucket[w]:
                            p_bucket[w].append(bucket)
                    break
        if not is_match:
            bucket_name = ('tmp' + id_name).format(save_idx)
            bucket_path = os.path.join(args.output, bucket_name)
            with open(bucket_path, 'a', encoding='utf-8') as outfile:
                outfile.write(line + '\n')
            for w in seg_list:
                p_bucket[w].append(bucket_name)
            save_idx += 1
    infile.close()

    # sort and rename files
    file_list = os.listdir(args.output)
    file_list = list(filter(lambda x: x.startswith('tmp'), file_list))
    cnt = dict()
    for file in file_list:
        file_path = os.path.join(args.output, file)
        cnt[file] = line_counter(file_path)

    sorted_cnt = sorted(cnt.items(), key=lambda kv: kv[1], reverse=True)
    for idx, (file_name, times) in enumerate(sorted_cnt):
        origin_path = os.path.join(args.output, file_name)
        new_path = os.path.join(args.output, id_name.format(idx))
        os.rename(origin_path, new_path)

    print('All is well')
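# The clustering loop above relies on a jaccard() helper that is not shown in
# this section. A minimal sketch under the assumed semantics (Jaccard similarity
# of two token lists, |A ∩ B| / |A ∪ B|, compared against args.threshold):
def jaccard(a, b):
    sa, sb = set(a), set(b)
    if not sa and not sb:
        return 0.0
    return len(sa & sb) / len(sa | sb)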
def main(argv=None):
    config = SafeConfigParser()
    config.read(cmd_args.config_path)

    if cmd_args.restore_checkpoint:
        print('Skipping training phase, loading model checkpoint from: ',
              config.get('main', 'checkpoint_path'))

    # Get the data.
    train_data_filename = utils.maybe_download(config, config.get('data', 'train_data_filename'))
    train_labels_filename = utils.maybe_download(config, config.get('data', 'train_labels_filename'))
    test_data_filename = utils.maybe_download(config, config.get('data', 'test_data_filename'))
    test_labels_filename = utils.maybe_download(config, config.get('data', 'test_labels_filename'))

    # Extract it into np arrays.
    train_data = utils.extract_data(config, train_data_filename, 60000)
    train_labels = utils.extract_labels(train_labels_filename, 60000)
    test_data = utils.extract_data(config, test_data_filename, 10000)
    test_labels = utils.extract_labels(test_labels_filename, 10000)

    validation_size = config.getint('main', 'validation_size')
    num_epochs = config.getint('main', 'num_epochs')

    # Generate a validation set.
    validation_data = train_data[:validation_size, ...]
    validation_labels = train_labels[:validation_size]
    train_data = train_data[validation_size:, ...]
    train_labels = train_labels[validation_size:]
    train_size = train_labels.shape[0]

    lenet5 = LeNet5(config)
    x, y_ = lenet5.train_input_placeholders()
    y_conv, logits, keep_prob, param_dict = lenet5.model(x)

    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits, y_))

    # L2 regularization for the fully connected parameters.
    regularizers = (tf.nn.l2_loss(param_dict['fc1_W']) + tf.nn.l2_loss(param_dict['fc1_b']) +
                    tf.nn.l2_loss(param_dict['fc2_W']) + tf.nn.l2_loss(param_dict['fc2_b']))
    # Add the regularization term to the loss.
    loss += 5e-4 * regularizers

    # Optimizer: set up a variable that's incremented once
    # per batch and controls the learning rate decay.
    batch = tf.Variable(0, dtype=tf.float32)
    # Decay once per epoch, using an exponential schedule starting at 0.01.
    learning_rate = tf.train.exponential_decay(
        0.01,
        batch * config.getint('main', 'batch_size'),
        train_size,
        0.95,
        staircase=True)
    optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9) \
        .minimize(loss, global_step=batch)

    input_dict = {
        "x": x,
        "y_": y_,
        "y_conv": y_conv,
        "keep_prob": keep_prob,
        "train_data": train_data,
        "train_labels": train_labels,
        "test_data": test_data,
        "test_labels": test_labels,
        "validation_data": validation_data,
        "validation_labels": validation_labels,
        "num_epochs": num_epochs,
        "train_size": train_size
    }

    saver = tf.train.Saver(tf.all_variables())
    evaluator = Evaluator(cmd_args, config, optimizer, learning_rate, loss, saver)
    evaluator.run(input_dict)

    fastgradientsign_advgen = FastGradientSign_AdvGen(cmd_args, [1, 28, 28, 1], saver, config)
    adv_out_df = fastgradientsign_advgen.run(input_dict)

    pkl_path = config.get('main', 'pickle_filepath')
    utils.ensure_dir(os.path.dirname(pkl_path))
    with open(pkl_path, "wb") as pkl:
        pickle.dump(adv_out_df, pkl)
import argparse
import logging
import os

import utils

parser = argparse.ArgumentParser()
parser.add_argument('--n_epochs', type=int, default=50)
parser.add_argument('--save_dir', type=str, default='saved/')
parser.add_argument('--batch', type=int, default=128)
parser.add_argument('--nemb', type=int, default=22)
parser.add_argument('--method', type=str, default='acgan')
parser.add_argument('--glr', type=float, default=0.0002)
parser.add_argument('--dlr', type=float, default=0.0002)
parser.add_argument('--nf', type=int, default=64)
parser.add_argument('--checkpoint', type=str, default=None)
opt = parser.parse_args()

utils.ensure_dir(opt.save_dir)

handlers = [
    logging.FileHandler(os.path.join(opt.save_dir, 'output.log'), mode='w'),
    logging.StreamHandler()
]
logging.basicConfig(handlers=handlers, level=logging.INFO, format='')
logger = logging.getLogger()

NOISE_DIM = 100
NF = opt.nf
N_EMB = opt.nemb

if __name__ == '__main__':
    L = DataLoader(data_dir='data/', n_emb=N_EMB,
def main():
    global connection, cursor
    cpu = multiprocessing.cpu_count()
    print("CPU {}".format(cpu))

    # preliminary work
    check_file(args.infile)
    ensure_dir(args.output)
    all_lines = 0
    if args.name_len_update:
        line_cnt = line_counter(args.infile)
        args.name_len = len(str(line_cnt)) + 1
    clean_dir(args.output, args.name_len)
    # end preliminary work

    all_bucked = defaultdict(list)
    p_bucket = defaultdict(list)
    save_idx = 0
    id_name = '{0:0' + str(args.name_len) + 'd}'

    print('Splitting sentence into different clusters ...')
    infile = open(args.infile, 'r', encoding="utf-8")
    i = 0
    all_data = infile.readlines()
    n = 10000  # chunk size: split the big list into sub-lists of n lines each
    lstgs = [all_data[i:i + n] for i in range(0, len(all_data), n)]
    print(len(lstgs))

    # tokenize all lines in parallel and cache the results
    tr = []
    pool = multiprocessing.Pool(processes=4)
    for xyz in lstgs:
        tr.append(pool.apply_async(fenci, (xyz,)))
    pool.close()
    pool.join()
    for res in tr:
        tmp = res.get()
        for z in tmp:
            if z not in jieba_cache.keys():
                jieba_cache[z] = tmp[z]
            else:
                print(z)
    for st in stop_words:
        stop_words_cache[st] = 1

    all_lines = len(jieba_cache)
    print("Start processing, {} lines in total".format(all_lines))
    print("jieba cache built: {} entries".format(len(jieba_cache)))
    print("stop-word cache built: {} entries".format(len(stop_words_cache)))

    all_data = jieba_cache.keys()
    for inline in all_data:
        i = i + 1
        print("Processing line {} of {}".format(i, all_lines))
        inline = inline.rstrip()
        line = inline.split(':::')[0]
        is_match = False
        seg_list = jieba_cache[line]
        if stop_words:
            seg_list = [w for w in seg_list if w not in stop_words_cache]
        for wd in seg_list:
            if is_match:
                break
            w_bucket = p_bucket[wd]
            for bucket in w_bucket:
                array = all_bucked[bucket]
                selected = sample_dict(array, args.sample_number)
                selected = list(map(lambda x: x.split(':::')[0], selected))
                selected = list(map(lambda x: jieba_cache[x], selected))
                # remove stop words
                if stop_words:
                    filt_selected = list()
                    for sen in selected:
                        filt_selected.append([w for w in sen if w not in stop_words_cache])
                    selected = filt_selected
                # calculate similarity with each bucket
                if all(jaccard(seg_list, cmp_list) > args.threshold for cmp_list in selected):
                    is_match = True
                    all_bucked[bucket].append(line)
                    for w in seg_list:
                        if bucket not in p_bucket[w]:
                            p_bucket[w].append(bucket)
                    break
        if not is_match:
            bucket_name = ('tmp' + id_name).format(save_idx)
            all_bucked[bucket_name] = [line]
            for w in seg_list:
                p_bucket[w].append(bucket_name)
            save_idx += 1
    infile.close()

    # write each bucket to the database
    batch_size = 0
    for zzzz in all_bucked:
        batch_size = batch_size + 1
        connection = pymysql.connect(host='47.99.87.74', user='******', password='******',
                                     db='august', port=33306)
        cursor = connection.cursor()
        all_bucked_data = []
        for zx in all_bucked[zzzz]:
            all_bucked_data.append([all_bucked[zzzz][0], zx, today])
        print("Batch {} of {}".format(batch_size, len(all_bucked)))
        cursor.executemany(
            "insert into 凤巢长尾词分组(group_id,keyword,created_date) values(%s,%s,%s)",
            all_bucked_data)
        connection.commit()
        cursor.close()
        connection.close()

    print('All is well')
def run(self, questions):
    args = self._get_parser()

    # preliminary work
    ensure_dir(args.output)
    if args.name_len_update:
        line_cnt = line_counter(args.infile)
        args.name_len = len(str(line_cnt)) + 1
    clean_dir(args.output, args.name_len)
    # end preliminary work

    p_bucket = defaultdict(list)
    save_idx = 0
    id_name = '{0:0' + str(args.name_len) + 'd}'

    # load stop words
    stop_words = get_stop_words(args.stop_words) if os.path.exists(args.stop_words) else list()
    # load tokenizer
    seg = Segmentor(args)

    print('Splitting sentence into different clusters ...')
    infile = questions
    for inline in tqdm(infile):
        inline = inline.rstrip()
        line = inline.split(':::')[0]
        is_match = False
        seg_list = list(seg.cut(line))
        if stop_words:
            seg_list = list(filter(lambda x: x not in stop_words, seg_list))
        for wd in seg_list:
            if is_match:
                break
            w_bucket = p_bucket[wd]
            for bucket in w_bucket:
                bucket_path = os.path.join(args.output, bucket)
                check_file(bucket_path)
                selected = sample_file(bucket_path, args.sample_number)
                selected = list(map(lambda x: x.split(':::')[0], selected))
                selected = list(map(lambda x: list(seg.cut(x)), selected))
                # remove stop words
                if stop_words:
                    filt_selected = list()
                    for sen in selected:
                        sen = list(filter(lambda x: x not in stop_words, sen))
                        filt_selected.append(sen)
                    selected = filt_selected
                # calculate similarity with each bucket
                if all(jaccard(seg_list, cmp_list) > args.threshold for cmp_list in selected):
                    is_match = True
                    with open(bucket_path, 'a', encoding='utf-8') as outfile:
                        outfile.write(line + '\n')
                    for w in seg_list:
                        if bucket not in p_bucket[w]:
                            p_bucket[w].append(bucket)
                    break
        if not is_match:
            bucket_name = ('tmp' + id_name).format(save_idx)
            bucket_path = os.path.join(args.output, bucket_name)
            with open(bucket_path, 'a', encoding='utf-8') as outfile:
                outfile.write(line + '\n')
            for w in seg_list:
                p_bucket[w].append(bucket_name)
            save_idx += 1

    # sort and rename files
    file_list = os.listdir(args.output)
    file_list = list(filter(lambda x: x.startswith('tmp'), file_list))
    cnt = dict()
    for file in file_list:
        file_path = os.path.join(args.output, file)
        cnt[file] = line_counter(file_path)

    sorted_cnt = sorted(cnt.items(), key=lambda kv: kv[1], reverse=True)
    name_map = dict()
    for idx, (file_name, times) in enumerate(sorted_cnt):
        origin_path = os.path.join(args.output, file_name)
        new_name = id_name.format(idx)
        new_path = os.path.join(args.output, new_name)
        os.rename(origin_path, new_path)
        name_map[file_name] = new_name

    for k, v in p_bucket.items():
        p_bucket[k] = list(map(lambda x: name_map[x], v))

    # merge the cluster files into one
    output_file = os.path.join(args.output, 'all_cluster.txt')
    try:
        if os.path.isfile(output_file):
            os.unlink(output_file)
    except Exception as e:
        print(e)

    file_list = os.listdir(args.output)
    fw = open(output_file, 'w+')
    for file in file_list:
        with open(os.path.join(args.output, file)) as f:
            for line in f.readlines():
                fw.write(str(int(file)) + ',' + line)
    fw.close()

    df = pd.read_csv(output_file, names=['id', 'text'])
    df.columns = ['cluster_id', 'ques']
    print('All is well')

    # reshape the dataframe: map each cluster's representative question to the
    # remaining questions in that cluster, e.g.
    #   cluster_id  ques
    #   0           aa       =>   {'aa': ['aaa'], 'bb': []}
    #   0           aaa
    #   1           bb
    df_dict = df.set_index('cluster_id').T.to_dict('records')[0]
    print(df_dict)
    result_dict = {}
    for cluster_id, ques in df_dict.items():
        li = df[df['cluster_id'] == cluster_id].ques.values.tolist()
        if ques in li:
            li.remove(ques)
        result_dict[ques] = li

    my_list = [result_dict]
    my_df = pd.DataFrame(my_list).T
    my_df = my_df.reset_index()
    my_df.columns = ['ques', 'info']
    print(my_df)
    return my_df.to_json(orient="records", force_ascii=False)
def __init__(self, model, loss, metrics, resume, config, train_logger=None):
    self.config = config
    self.logger = logging.getLogger(self.__class__.__name__)
    self.model = model
    self.loss = loss
    self.metrics = metrics
    self.name = config['name']
    self.epochs = config['trainer']['epochs']
    self.save_freq = config['trainer']['save_freq']
    self.verbosity = config['trainer']['verbosity']
    self.summaryWriter = SummaryWriter()

    if tf.test.is_gpu_available():
        if config['cuda']:
            self.with_cuda = True
            self.gpus = {i: item for i, item in enumerate(self.config['gpus'])}
            device = 'cuda'
        else:
            self.with_cuda = False
            device = 'cpu'
    else:
        self.logger.warning(
            "Warning: There's no CUDA support on this machine, training is performed on CPU.")
        self.with_cuda = False
        device = 'cpu'

    self.device = tf.device(device)
    self.model.to(self.device)
    self.logger.debug('Model is initialized.')
    self._log_memory_usage()

    self.train_logger = train_logger
    self.optimizer = self.model.optimize(config['optimizer_type'], config['optimizer'])
    self.lr_scheduler = getattr(keras.callbacks.LearningRateScheduler,
                                config['lr_scheduler_type'], None)
    if self.lr_scheduler:
        self.lr_scheduler = self.lr_scheduler(self.optimizer, **config['lr_scheduler'])
        self.lr_scheduler_freq = config['lr_scheduler_freq']

    self.monitor = config['trainer']['monitor']
    self.monitor_mode = config['trainer']['monitor_mode']
    assert self.monitor_mode == 'min' or self.monitor_mode == 'max'
    self.monitor_best = math.inf if self.monitor_mode == 'min' else -math.inf
    self.start_epoch = 1

    self.checkpoint_dir = os.path.join(config['trainer']['save_dir'], self.name)
    ensure_dir(self.checkpoint_dir)
    json.dump(config,
              open(os.path.join(self.checkpoint_dir, 'config.json'), 'w'),
              indent=4,
              sort_keys=False)
    if resume:
        self._resume_checkpoint(resume)
def run(self, input_dict):
    x = input_dict["x"]
    y_ = input_dict["y_"]
    y_conv = input_dict["y_conv"]
    keep_prob = input_dict["keep_prob"]
    train_data = input_dict["train_data"]
    train_labels = input_dict["train_labels"]
    test_data = input_dict["test_data"]
    test_labels = input_dict["test_labels"]
    validation_data = input_dict["validation_data"]
    validation_labels = input_dict["validation_labels"]
    num_epochs = input_dict["num_epochs"]
    train_size = input_dict["train_size"]

    batch_size = self.config.getint('main', 'batch_size')
    checkpoint_path = self.config.get('main', 'checkpoint_path')
    num_classes = self.config.getint('main', 'num_classes')
    eval_frequency = self.config.getint('main', 'eval_frequency')
    utils.ensure_dir(os.path.dirname(checkpoint_path))

    start_time = time.time()
    with tf.Session() as sess:
        tf.initialize_all_variables().run()
        print('Initialized!')
        if not self.cmd_args.restore_checkpoint:
            print('No checkpoint to load, training model from scratch...')
            if self.cmd_args.test:
                iter_range = xrange(1)
            else:
                iter_range = xrange(int(num_epochs * train_size) // batch_size)
            for step in iter_range:
                offset = (step * batch_size) % (train_size - batch_size)
                batch_data = train_data[offset:(offset + batch_size), ...]
                batch_labels = train_labels[offset:(offset + batch_size)]
                feed_dict = {x: batch_data, y_: batch_labels, keep_prob: 0.5}
                _, l, lr, predictions = sess.run(
                    [self.optimizer, self.loss, self.learning_rate, y_conv],
                    feed_dict=feed_dict)
                if step % eval_frequency == 0:
                    if not self.cmd_args.test:
                        path = self.saver.save(sess, checkpoint_path)
                        print("Saved model checkpoint to {}\n".format(path))
                    elapsed_time = time.time() - start_time
                    start_time = time.time()
                    print('Step %d (epoch %.2f), %.1f ms' %
                          (step, float(step) * batch_size / train_size,
                           1000 * elapsed_time / eval_frequency))
                    print('Minibatch loss: %.3f, learning rate: %.6f' % (l, lr))
                    print('Minibatch error: %.1f%%' %
                          utils.error_rate(predictions, batch_labels, self.onehot_labels))
                    print('Validation error: %.1f%%' % utils.error_rate(
                        self.eval_in_batches(y_conv, x, keep_prob, validation_data,
                                             sess, batch_size, num_classes),
                        validation_labels, self.onehot_labels))
                    sys.stdout.flush()

        # Finally print the result!
        test_error = utils.error_rate(
            self.eval_in_batches(y_conv, x, keep_prob, test_data, sess,
                                 batch_size, num_classes),
            test_labels, self.onehot_labels)
        print('Test error: %.1f%%' % test_error)
def main(trainingdir, model, num_epochs, size_batch_test, logdir, logdir_w,
         perform_one_hot, binarize_labels):
    global_step = tf.get_variable('global_step', dtype=tf.int32, initializer=0, trainable=False)

    train_list, valid_list, test_list = get_file_lists(trainingdir)
    label_input_size, label_output_size = get_tensor_size(perform_one_hot, binarize_labels)

    test_dataset = create_dataset(filenames=test_list,
                                  mode="testing",
                                  num_epochs=1,
                                  batch_size=size_batch_test,
                                  perform_one_hot=perform_one_hot,
                                  binarize_labels=binarize_labels)
    test_iterator = test_dataset.make_initializable_iterator()

    # A feedable iterator assigns each iterator a unique string handle it is going to work on
    handle = tf.placeholder(tf.string, shape=[])
    iterator = tf.data.Iterator.from_string_handle(handle, test_dataset.output_types,
                                                   test_dataset.output_shapes)
    x, _ = iterator.get_next()
    x.set_shape([None, 192, 192, 4])
    x = tf.cast(x, tf.float32)

    training_placeholder = tf.placeholder(dtype=tf.bool, shape=[], name='training_placeholder')

    if model == "unet_keras":
        from models import unet_keras as model
        logits, logits_soft = model.unet(x, training_placeholder, label_output_size)
    elif model == "unet_tensorflow":
        from models import unet_tensorflow as model
        logits, logits_soft = model.unet(x,
                                         training=training_placeholder,
                                         norm_option=False,
                                         drop_val=0.5,
                                         label_output_size=label_output_size)

    ######################################## SUMMARIES #########################################
    tf.summary.image('input_0', tf.expand_dims(x[:, :, :, 0], axis=-1))
    if label_output_size == 1:
        tf.summary.image('prediction', tf.expand_dims(logits_soft[:, :, :, 0], axis=-1))
    elif label_output_size > 1:
        tf.summary.image("prediction", logits_soft[:, :, :, 1:])
    summary_test = tf.summary.merge_all()

    # op to write logs to Tensorboard
    logdir_w = os.path.expanduser(logdir_w)
    utils.ensure_dir(logdir_w)
    writer = tf.summary.FileWriter(logdir_w, graph=tf.get_default_graph())

    # Weight saver
    model_checkpoint_path = os.path.join(logdir, 'Checkpoint')
    saver = tf.train.Saver()

    ######################################## RUN SESSION #######################################
    with tf.Session() as sess:
        # Restore weights from the latest checkpoint in logdir
        saver.restore(sess, tf.train.latest_checkpoint(logdir))

        test_handle = sess.run(test_iterator.string_handle())
        sess.run(test_iterator.initializer)
        try:
            while True:
                summary_val, logits_test = sess.run(
                    [summary_test, logits_soft],
                    feed_dict={handle: test_handle, training_placeholder: False})
                writer.add_summary(summary_val)
        except tf.errors.OutOfRangeError:
            pass
    return
                     pretrained=True).to(device)

# %%
criterion = functools.partial(F.cross_entropy, reduction='mean')
optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=0.001)
lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=15, verbose=True)

# %%
tracked_metrics = [
    metrics.Accuracy(multiclass=True),
]

name = f'resnet18-pneumonia-classifier-s{seed}-3-classes-unprocessed'
utils.ensure_dir(f'logs/{vars.corda_version}/{name}')

# %%
best_model = trainer.fit(model=model,
                         train_dataloader=train_dataloader,
                         val_dataloader=val_dataloader,
                         test_dataloader=test_dataloader,
                         test_every=10,
                         criterion=criterion,
                         optimizer=optimizer,
                         scheduler=lr_scheduler,
                         metrics=tracked_metrics,
                         n_epochs=epochs,
                         name=name,
                         metric_choice=metric,
                         mode=mode,
def main(trainingdir, model, num_epochs, size_batch_train, size_batch_valid, step_metrics,
         steps_saver, learning_rate, logdir, restore_weights, perform_one_hot, binarize_labels):
    global_step = tf.get_variable('global_step', dtype=tf.int32, initializer=0, trainable=False)

    train_list, valid_list, _ = get_file_lists(trainingdir)
    label_input_size, label_output_size = get_tensor_size(perform_one_hot, binarize_labels)

    train_dataset = create_dataset(filenames=train_list,
                                   mode="training",
                                   num_epochs=1,
                                   batch_size=size_batch_train,
                                   perform_one_hot=perform_one_hot,
                                   binarize_labels=binarize_labels)
    train_iterator = train_dataset.make_initializable_iterator()

    validation_dataset = create_dataset(filenames=valid_list,
                                        mode="validation",
                                        num_epochs=1,
                                        batch_size=size_batch_valid,
                                        perform_one_hot=perform_one_hot,
                                        binarize_labels=binarize_labels)
    validation_iterator = validation_dataset.make_initializable_iterator()

    # A feedable iterator assigns each iterator a unique string handle it is going to work on
    handle = tf.placeholder(tf.string, shape=[])
    iterator = tf.data.Iterator.from_string_handle(handle, train_dataset.output_types,
                                                   train_dataset.output_shapes)
    x, y = iterator.get_next()
    x.set_shape([None, 192, 192, 4])
    x = tf.cast(x, tf.float32)

    training_placeholder = tf.placeholder(dtype=tf.bool, shape=[], name='training_placeholder')

    if model == "unet_keras":
        from models import unet_keras as model
        logits, logits_soft = model.unet(x, training_placeholder, label_output_size)
    elif model == "unet_tensorflow":
        from models import unet_tensorflow as model
        logits, logits_soft = model.unet(x,
                                         training=training_placeholder,
                                         norm_option=False,
                                         drop_val=0.5,
                                         label_output_size=label_output_size)

    y.set_shape([None, 192, 192, label_input_size])
    y = tf.cast(y, tf.int32)

    if label_input_size > 1:  # one-hot encoding
        loss_op = tf.reduce_mean(tf.losses.softmax_cross_entropy(onehot_labels=y, logits=logits))
    else:  # label encoding
        if label_output_size == 1:
            loss_op = tf.reduce_mean(tf.losses.sigmoid_cross_entropy(multi_class_labels=y,
                                                                     logits=logits))
        else:
            loss_op = tf.reduce_mean(tf.losses.sparse_softmax_cross_entropy(labels=y,
                                                                            logits=logits))

    if label_input_size > 1:  # one-hot encoding
        # Define the IoU metric and its update operation
        IoU_metrics, IoU_metrics_update = tf.metrics.mean_iou(labels=y,
                                                              predictions=logits_soft,
                                                              num_classes=label_input_size,
                                                              name='my_metric_IoU')
        # Isolate the variables stored behind the scenes by the metric operation
        running_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES, scope="my_metric_IoU")
        # Define an initializer to initialize/reset the running variables
        running_vars_initializer = tf.variables_initializer(var_list=running_vars)

    optimizer = tf.train.AdamOptimizer(learning_rate)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    train_op = tf.group([update_ops, optimizer.minimize(loss_op, global_step=global_step)])

    # Weight saver
    model_checkpoint_path = os.path.join(logdir, 'Checkpoint')
    saver = tf.train.Saver()

    ######################################## SUMMARIES #########################################
    tf.summary.image('input_0', tf.expand_dims(x[:, :, :, 0], axis=-1))
    if label_input_size == 1:
        tf.summary.image("labels", tf.cast(y, tf.float32))
    elif label_input_size > 1:
        tf.summary.image('labels_0', tf.expand_dims(tf.cast(y, tf.float32)[:, :, :, 0], axis=-1))
        tf.summary.image('labels_1', tf.expand_dims(tf.cast(y, tf.float32)[:, :, :, 1], axis=-1))
        if label_input_size > 2:
            tf.summary.image('labels_2', tf.expand_dims(tf.cast(y, tf.float32)[:, :, :, 2], axis=-1))
            tf.summary.image('labels_3', tf.expand_dims(tf.cast(y, tf.float32)[:, :, :, 3], axis=-1))
    if label_output_size == 1:
        tf.summary.image('prediction', tf.expand_dims(logits_soft[:, :, :, 0], axis=-1))
    elif label_output_size > 1:
        tf.summary.image("prediction", logits_soft[:, :, :, 1:])
        tf.summary.image('prediction_0', tf.expand_dims(logits_soft[:, :, :, 0], axis=-1))
        tf.summary.image('prediction_1', tf.expand_dims(logits_soft[:, :, :, 1], axis=-1))
        if label_output_size > 2:
            tf.summary.image('prediction_2', tf.expand_dims(logits_soft[:, :, :, 2], axis=-1))
            tf.summary.image('prediction_3', tf.expand_dims(logits_soft[:, :, :, 3], axis=-1))
    tf.summary.scalar("loss", loss_op)
    tf.summary.histogram("logits_soft", logits_soft)
    tf.summary.histogram("logits", logits)
    summary_op = tf.summary.merge_all()

    # op to write logs to Tensorboard
    logdir = os.path.expanduser(logdir)
    utils.ensure_dir(logdir)
    writer = tf.summary.FileWriter(logdir, graph=tf.get_default_graph())
    writer_val = tf.summary.FileWriter(os.path.join(logdir, 'validation loss'),
                                       graph=tf.get_default_graph())

    ######################################## RUN SESSION #######################################
    with tf.Session() as sess:
        # Initialize variables
        if restore_weights:
            saver.restore(sess, tf.train.latest_checkpoint(logdir))
        else:
            sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())

        train_handle = sess.run(train_iterator.string_handle())
        validation_handle = sess.run(validation_iterator.string_handle())

        # training, validation and saving
        for epoch in range(num_epochs):
            sess.run(train_iterator.initializer)
            step = 0
            try:
                while True:
                    # train
                    _, cost, summary_val, step_gl, logits_val, _ = sess.run(
                        [train_op, loss_op, summary_op, global_step, logits, logits_soft],
                        feed_dict={handle: train_handle, training_placeholder: True})
                    writer.add_summary(summary_val, step_gl)
                    step += 1
                    print('\n Training step: Epoch {}, batch {} -- Loss: {:.3f}'.format(
                        epoch + 1, step, cost))

                    # validation
                    if step % step_metrics == 0:
                        total_validation_loss = []  # loss of each validation batch
                        sess.run(validation_iterator.initializer)
                        step_val = 0
                        # initialize/reset the running variables of the IoU metric
                        if label_input_size > 1:  # one-hot encoding
                            sess.run(running_vars_initializer)
                        try:
                            print('\nPerforming validation')
                            while True:
                                if label_input_size > 1:  # one-hot encoding
                                    cost_valid, _ = sess.run(
                                        [loss_op, IoU_metrics_update],
                                        feed_dict={handle: validation_handle,
                                                   training_placeholder: False})
                                else:
                                    cost_valid = sess.run(
                                        [loss_op],
                                        feed_dict={handle: validation_handle,
                                                   training_placeholder: False})
                                total_validation_loss.append(cost_valid)
                                step_val += 1
                        except tf.errors.OutOfRangeError:
                            pass

                        # loss
                        total_validation_loss = np.mean(total_validation_loss)
                        validation_loss_summary = tf.Summary(value=[
                            tf.Summary.Value(tag="loss", simple_value=total_validation_loss)])
                        writer_val.add_summary(validation_loss_summary, step_gl)

                        # IoU metric
                        if label_input_size > 1:  # one-hot encoding
                            IoU_score = sess.run(IoU_metrics)
                            IoU_summary = tf.Summary(value=[
                                tf.Summary.Value(tag="IoU_metrics", simple_value=IoU_score)])
                            writer.add_summary(IoU_summary, step_gl)
                            print('\n Epoch {} and training batch {} -- Validation loss {:.3f} '
                                  'and IoU metrics {:.3f}'.format(epoch + 1, step,
                                                                  total_validation_loss,
                                                                  IoU_score))
                        else:
                            print('\n Epoch {} and training batch {} -- Validation loss '
                                  '{:.3f}'.format(epoch + 1, step, total_validation_loss))

                    # saving
                    if step % steps_saver == 0:
                        print('\n Step {} Saving weights to {}'.format(step + 1,
                                                                       model_checkpoint_path))
                        saver.save(sess, save_path=model_checkpoint_path,
                                   global_step=global_step)
            except tf.errors.OutOfRangeError:
                pass
    return