def download(left, right, top, bottom, zoom, filename, maptype="default"):
    for x in trange(left, right + 1):
        for y in trange(top, bottom + 1):
            path = './tiles/%s/%i/%i/%i.png' % (filename, zoom, x, y)
            if not os.path.exists(path):
                _download(x, y, zoom, filename, maptype)
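# The tile fetcher `_download` is not shown above.  A minimal sketch of what
# such a helper might look like -- the URL template and the `requests`
# dependency are assumptions for illustration, not part of the original code:
import os
import requests  # assumed dependency

TILE_URL = "https://tiles.example.com/{maptype}/{z}/{x}/{y}.png"  # hypothetical endpoint

def _download(x, y, zoom, filename, maptype="default"):
    """Fetch one tile and cache it under ./tiles/<filename>/<zoom>/<x>/<y>.png (sketch)."""
    path = './tiles/%s/%i/%i/%i.png' % (filename, zoom, x, y)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    resp = requests.get(TILE_URL.format(maptype=maptype, z=zoom, x=x, y=y), timeout=30)
    resp.raise_for_status()
    with open(path, 'wb') as f:
        f.write(resp.content)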
def main(): """Main program.""" answer = 0 start = time.time() max_period = 0 for index in tqdm.trange(1, 1000): period = calculate_period_length(index) if period > max_period: max_period = period answer = index end = time.time() print("The answer is %d" % answer) print("%f seconds elapsed" % (end - start)) start = time.time() max_period = 0 for index in tqdm.trange(1, 1000): period = lambda_decimal_period(index) if period > max_period: max_period = period answer = index end = time.time() print("The answer is %d" % answer) print("%f seconds elapsed" % (end - start)) import pyperclip pyperclip.copy(str(answer)) print("The answer has been placed in the clipboard.")
def trees_are_random(filename):
    res_file = filename.replace('_0', '_processed')
    with np.load(filename) as f:
        res, gold = f['res'], f['gold']
    num_trees, num_order, _ = res.shape
    all_trees = list(range(num_trees))
    all_order = list(range(num_order))
    all_pred = np.arange(len(gold))
    nrep = 100
    rate = np.zeros((num_trees//2 + 1,
                     [j//13 - 1 for j in range(13, num_order, 13)][-1] + 1, 2))
    frac = 1.3
    dropped = []
    for i in trange(1, num_trees + 1, 2):
        for j in trange(13, num_order, 13):
            tmp_res = []
            for k in range(nrep):
                trees = random.sample(all_trees, i)
                orders = random.sample(all_order, j - 1 if j % 2 == 0 else j)
                if i == 1:
                    vals = res[trees, orders, :]
                else:
                    vals = res[np.ix_(trees, orders, all_pred)].sum(0)
                tmp_res.append(mistakes(vals))
            tmp_res = np.asarray(tmp_res)  # so the threshold comparison below is elementwise
            thre = frac * np.median(tmp_res)
            good = tmp_res < thre
            bad = np.logical_not(good)
            dropped.append((i, j, bad.sum() / nrep))
            rate[(i - 1)//2, j//13 - 1, :] = np.mean(tmp_res), np.std(tmp_res)
    np.savez_compressed(res_file, rate=rate)
def main():
    bar = trange(60 * 25)
    bar.write("Working time...")
    for t in bar:
        time.sleep(1)

    bar = trange(60 * 5)
    bar.write("Break time...")
    for t in bar:
        time.sleep(1)
def process_tilenum(left, right, top, bottom, zoom, output='output/mosaic.png'):
    """ download and mosaic by tile number """
    for x in trange(left, right + 1):
        for y in trange(top, bottom + 1):
            path = './tiles/%i/%i/%i.png' % (zoom, x, y)
            if not os.path.exists(path):
                _download(x, y, zoom)
    _mosaic(left, right, top, bottom, zoom, output)
def test_trange(): """ Test trange """ with closing(StringIO()) as our_file: for _ in trange(3, file=our_file, leave=True): pass our_file.seek(0) assert '| 3/3 ' in our_file.read() with closing(StringIO()) as our_file2: for _ in trange(3, file=our_file2, leave=False): pass our_file2.seek(0) assert '| 3/3 ' not in our_file2.read()
def ensemble(validation_base_path, validation_folder, validation_predicted_folder):
    pkl_files = []
    weights = []
    # weight_file = './file_weight_128.csv'
    # weight_file = './file_weight_128_144models.csv'
    # weight_file = './file_weight_128_144_updated models.csv'
    # weight_file = './file_weight_128_144_3rd version models.csv'
    weight_file = './10 best models weights for task 1.csv'
    # weight_file = './5 best models weights for task 1.csv'
    # weight_file = './20 best models weights for task 1.csv'
    with open(weight_file, 'rb') as f:
        rows = csv.reader(f, delimiter=',')
        # next(rows, None)
        for row in rows:
            if '.pkl' in row[0]:
                pkl_files.append(validation_base_path + row[0])
            else:
                pkl_files.append(validation_base_path + row[0] + '.pkl')
            weights.append(float(row[1]))
    print (len(pkl_files))
    print weights
    print np.sum(weights)

    mask_pred_challenge_list = []
    for i in trange(len(pkl_files)):
        mask_pred_challenge = pkl.load(open(pkl_files[i], 'rb'))
        mask_pred_challenge_list.append(mask_pred_challenge)
    mask_pred_challenge_list = np.array(mask_pred_challenge_list)
    print mask_pred_challenge_list.shape

    weights = np.array(weights)
    mask_pred_challenge = np.dot(mask_pred_challenge_list.transpose(1, 2, 3, 0), weights)
    print mask_pred_challenge.shape

    if not os.path.exists(validation_predicted_folder):
        os.makedirs(validation_predicted_folder)

    cutoff = 0.5
    mask_pred_challenge_b = (np.where(mask_pred_challenge >= cutoff, 1, 0) * 255).astype(np.uint8)
    challenge_list = ISIC.list_from_folder(validation_folder)
    for i in trange(len(challenge_list)):
        _, _ = ISIC.show_images_full_sized(challenge_list,
                                           img_mask_pred_array=mask_pred_challenge_b,
                                           image_folder=validation_folder,
                                           mask_folder=None, index=i,
                                           output_folder=validation_predicted_folder,
                                           plot=False)
def test_trange():
    our_file = StringIO()
    for i in trange(3, file=our_file, leave=True):
        pass
    our_file.seek(0)
    assert '| 3/3 ' in our_file.read()
    our_file.close()

    our_file2 = StringIO()
    for i in trange(3, file=our_file2, leave=False):
        pass
    our_file2.seek(0)
    assert '| 3/3 ' not in our_file2.read()
    our_file2.close()
def _mosaic(left, right, top, bottom, zoom, output='output/mosaic.png'):
    size_x = (right - left + 1) * 256
    size_y = (bottom - top + 1) * 256
    output_im = Image.new("RGBA", (size_x, size_y))

    for x in trange(left, right + 1):
        for y in trange(top, bottom + 1):
            path = './tiles/%i/%i/%i.png' % (zoom, x, y)
            target_im = Image.open(path)
            output_im.paste(target_im, (256 * (x - left), 256 * (y - top)))

    # Create the output directory if the path includes one.
    output_dir = os.path.split(output)[0]
    if output_dir and not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    output_im.save(output)
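# Usage sketch for the two helpers above; the tile range and zoom level are
# made-up example values, not taken from the original script:
#
#     process_tilenum(53857, 53860, 24246, 24250, 17, output='output/mosaic.png')
#
# This downloads any tiles that are not already cached and then stitches the
# (right-left+1) x (bottom-top+1) grid into a single RGBA mosaic.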
def move_and_snap(m, s, fname, zenith=0, destination=0, acc_len=1, n_accs=10, dt=0):
    '''
    Function that gets called over a range of airmasses in go(). Moves to the
    destination, takes a snapshot, calculates true time based on offset,
    queries the motor position, then calls io.write_to_hdf5 on the hdf5
    filename.

    Inputs: Required - Motor, Spec, and hdf5 filename.
            Optional - zenith angle wrt. 0 on the motor (degs), destination
            wrt. zenith (degs), accumulation length in secs, step size in
            degrees, number of accumulations, computer's offset from true utc
            time in secs.
    Outputs: None, writes to disk.
    '''
    # print('Moving to {} deg ZA'.format(destination))
    m.abst(destination + zenith)
    # print('Integrating')
    for i in tqdm.trange(n_accs, unit='accs'):
        spec = s.snap_spec()
        utc = ts.true_time(dt)
        pos = m.position()
        mjd = ts.iso_to_mjd(utc)
        io.write_to_hdf5(fname, spec, {
            'angle_degs': pos,
            'utc': utc,
            'mjd': mjd,
            'samp_rate_mhz': s.samp_rate,
            'acc_len_secs': s.acc_len,
            'zenith_degs': zenith
        })
def main(): """Main program.""" answer = 0 start_time = time.time() # Find next sequence after 1487, 4817, 8147 for number_1 in tqdm.trange(1488, 9998): for index in range(1, (9999 - number_1) / 2): number_2 = number_1 + index number_3 = number_1 + (2 * index) if all([sorted(str(n)) == sorted(str(number_1)) \ for n in [number_2, number_3]]) \ and all([sympy.ntheory.primetest.isprime(n) \ for n in [number_1, number_2, number_3]]): answer = int(str(number_1) + str(number_2) + str(number_3)) break if answer > 0: break end_time = time.time() print("The answer is %d" % answer) print("%f seconds elapsed." % (end_time - start_time)) import pyperclip pyperclip.copy(str(answer)) print("The answer has been placed in the clipboard.")
def simulate_system(bundle, reps=10, check_laplacian=True):
    """ Generate data from system setup """
    # lonely investigation :'(
    if check_laplacian:
        investigate_laplacian(bundle.graph)

    # solve system on network
    corr_mats = []
    var_sers = []
    all_sols = []
    for _ in trange(reps):
        sols, ts = solve_system(bundle.system_config)

        cmat = compute_correlation_matrix(sols)
        vser = compute_cluster_num(sols, len(bundle.graph.nodes()))

        corr_mats.append(cmat)
        var_sers.append(vser)
        all_sols.append(sols)

    bundle['all_sols'] = all_sols
    bundle['corr_mats'] = np.array(corr_mats)
    bundle['var_sers'] = np.array(var_sers)
    bundle['ts'] = ts
    return bundle
def topography(df):
    xlim = int(np.floor(df.xcen.max()) + 1)
    ylim = int(np.floor(df.ycen.max()) + 1)

    # Take z in reverse order, so that when we iterate through them, the
    # lowest z value (the top of the topography) will come up first.
    print("Computing topography", end='\r')
    sys.stdout.flush()
    df_iter = df.sort_values(['ycen', 'xcen', 'zcen']).itertuples()
    xcen = df.columns.get_loc("xcen") + 1
    ycen = df.columns.get_loc("ycen") + 1
    zcen = df.columns.get_loc("zcen") + 1
    row = None
    topo = np.zeros((ylim, xlim), dtype=np.int32)
    for y in trange(ylim, desc="Computing topography", leave=True):
        for x in range(xlim):
            def same_pixel(row):
                """Use tuple ordering to determine whether we're ahead or
                behind the dataframe."""
                image_index = (y, x)
                df_index = (row[ycen], row[xcen])
                return df_index < image_index

            # Drop until x,y coordinates match.  Since this is sorted by
            # x,y,z, the next pixel will be the lowest z.
            # (`dropwhile` here is assumed to be a local three-argument
            # helper seeded with `row`, not itertools.dropwhile.)
            row = dropwhile(same_pixel, df_iter, row)
            if row[xcen] == x and row[ycen] == y:
                topo[y, x] = row[zcen]
            else:
                raise RuntimeError("No data at coordinate x={},y={}"
                                   .format(x, y))
    return topo
def get_training_bbox(bbox_dir, imglist):
    import xml.etree.ElementTree as ET
    ret = []

    def parse_bbox(fname):
        root = ET.parse(fname).getroot()
        size = root.find('size').getchildren()
        size = list(map(int, [size[0].text, size[1].text]))
        box = root.find('object').find('bndbox').getchildren()
        box = list(map(lambda x: float(x.text), box))
        return np.asarray(box, dtype='float32')

    with timed_operation('Loading Bounding Boxes ...'):
        cnt = 0
        for k in tqdm.trange(len(imglist)):
            fname = imglist[k][0]
            fname = fname[:-4] + 'xml'
            fname = os.path.join(bbox_dir, fname)
            try:
                ret.append(parse_bbox(fname))
                cnt += 1
            except Exception:
                ret.append(None)
        logger.info("{}/{} images have bounding box.".format(cnt, len(imglist)))
    return ret
def test_iter_overhead_hard():
    """Test overhead of iteration based tqdm (hard)"""
    total = int(1e5)

    with closing(MockIO()) as our_file:
        a = 0
        with trange(total, file=our_file, leave=True, miniters=1,
                    mininterval=0, maxinterval=0) as t:
            with relative_timer() as time_tqdm:
                for i in t:
                    a += i
        assert a == (total * total - total) / 2.0

        a = 0
        with relative_timer() as time_bench:
            for i in _range(total):
                a += i
                our_file.write(("%i" % a) * 40)

    # Compute relative overhead of tqdm against native range()
    try:
        assert time_tqdm() < 60 * time_bench()
    except AssertionError:
        raise AssertionError('trange(%g): %f, range(%g): %f' %
                             (total, time_tqdm(), total, time_bench()))
def main(): """Main program.""" answer = 0 start_time = time.time() denominator = Fraction(1, 1) for index in tqdm.trange(1000): if index == 0: denominator = Fraction(2, 1) elif index == 1: denominator = 2 + Fraction(1, 2) else: denominator = 2 + Fraction(1, denominator) continual_fraction = 1 + Fraction(1, denominator) numerator_digits = len(str(continual_fraction.numerator)) denominator_digits = len(str(continual_fraction.denominator)) if numerator_digits > denominator_digits: answer += 1 end_time = time.time() print("The answer is %d" % answer) print("%f seconds elapsed." % (end_time - start_time)) import pyperclip pyperclip.copy(str(answer)) print("The answer has been placed in the clipboard.")
def test_iter_overhead_simplebar_hard():
    """Test overhead of iteration based tqdm vs simple progress bar (hard)"""
    total = int(1e4)

    with closing(MockIO()) as our_file:
        a = 0
        with trange(total, file=our_file, leave=True, miniters=1,
                    mininterval=0, maxinterval=0) as t:
            with relative_timer() as time_tqdm:
                for i in t:
                    a += i
        assert a == (total * total - total) / 2.0

        a = 0
        s = simple_progress(_range(total), file=our_file, leave=True,
                            miniters=1, mininterval=0)
        with relative_timer() as time_bench:
            for i in s:
                a += i

    # Compute relative overhead of tqdm against native range()
    try:
        assert time_tqdm() < 2.5 * time_bench()
    except AssertionError:
        raise AssertionError('trange(%g): %f, simple_progress(%g): %f' %
                             (total, time_tqdm(), total, time_bench()))
def featurize_time_series_submission(submission_df, features, structure):
    print("Featurizing time series")
    print(features.shape[1])
    assignments = structure['ASS_ASSIGNMENT']
    days = ['Lundi', 'Mardi', 'Mercredi', 'Jeudi', 'Vendredi', 'Samedi', 'Dimanche']
    ass_dfs = {}
    for day in days:
        for ass in assignments:
            ass_dfs[day + '_' + ass] = pd.read_pickle('files/split/' + day + '_' + ass + '.pkl')

    new_features = np.full((len(submission_df), _n_features), np.nan)
    submission_df = submission_df.set_index(['DAY_WE_DS', 'DATE', 'ASS_ASSIGNMENT'], drop=False)
    for i in trange(0, submission_df.shape[0]):
        (day, datetime, ass) = submission_df.index[i]
        df = ass_dfs[day + '_' + ass][(ass_dfs[day + '_' + ass].DATE.dt.hour == datetime.hour) &
                                      (ass_dfs[day + '_' + ass].DATE.dt.minute == datetime.minute)]
        df = df[df.DATE < datetime - DateOffset(days=3)]
        old_values = df.tail(_n_features)['CSPL_RECEIVED_CALLS'].as_matrix()
        # print(old_values)
        for j in range(len(old_values)):
            new_features[i, j] = old_values[j]

    for j in range(_n_features):
        features['prev_value_' + str(j)] = new_features[:, j]
    return features
def _maybe_generate_and_save(self, except_list=[]):
    self.data = {}

    for name, num in self.data_num.items():
        if name in except_list:
            tf.logging.info("Skip creating {} because of given except_list {}".format(name, except_list))
            continue
        path = self.get_path(name)
        if not os.path.exists(path):
            tf.logging.info("Creating {} for [{}]".format(path, self.task))

            x = np.zeros([num, self.max_length, 2], dtype=np.float32)
            y = np.zeros([num, self.max_length], dtype=np.int32)

            for idx in trange(num, desc="Create {} data".format(name)):
                n_nodes = self.rng.randint(self.min_length, self.max_length + 1)
                nodes, res = generate_one_example(n_nodes, self.rng)
                x[idx, :len(nodes)] = nodes
                y[idx, :len(res)] = res

            np.savez(path, x=x, y=y)
            self.data[name] = TSP(x=x, y=y, name=name)
        else:
            tf.logging.info("Skip creating {} for [{}]".format(path, self.task))
            tmp = np.load(path)
            self.data[name] = TSP(x=tmp['x'], y=tmp['y'], name=name)
def spaceConvNumba2(self):
    """ Exactly the same as the former method, just contains a nested
    function so the dot product appears more obvious
    """

    @checkarrays
    @jit
    def dotJit(subarray, kernel):
        """ perform a simple 'dot product' between the 2 dimensional
        image subsets.
        """
        total = 0.0
        # This is the O(n^2) part of the algorithm
        for i in xrange(subarray.shape[0]):
            for j in xrange(subarray.shape[1]):
                total += subarray[i][j] * kernel[i][j]
        return total

    # this is the O(N^2) part of the algorithm
    for i in trange(self.__rangeX_):
        for j in xrange(self.__rangeY_):
            # dotJit is located outside the class :P
            self.__arr_[i, j] = dotJit(
                self.array[i:i + self.__rangeKX_, j:j + self.__rangeKY_],
                self.kernel
            )

    return self.__arr_
def test_iter_overhead():
    """ Test overhead of iteration based tqdm """
    try:
        assert checkCpuTime()
    except:
        raise SkipTest

    total = int(1e6)

    with closing(MockIO()) as our_file:
        a = 0
        with relative_timer() as time_tqdm:
            for i in trange(total, file=our_file):
                a += i
        assert a == (total * total - total) / 2.0

        a = 0
        with relative_timer() as time_bench:
            for i in _range(total):
                a += i
                our_file.write(a)

    # Compute relative overhead of tqdm against native range()
    if time_tqdm() > 9 * time_bench():
        raise AssertionError('trange(%g): %f, range(%g): %f' %
                             (total, time_tqdm(), total, time_bench()))
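# The overhead tests above rely on a `relative_timer` helper from tqdm's test
# suite.  A sketch of how such a context manager can be written (this mirrors
# the pattern the tests appear to use, but is not guaranteed to be the exact
# upstream implementation):
from contextlib import contextmanager
from time import time

@contextmanager
def relative_timer():
    start = time()
    elapser = lambda: time() - start
    yield lambda: elapser()      # callers keep this handle after the block
    spent = time() - start
    elapser = lambda: spent      # freeze the elapsed time on exit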
def send_dataflow_zmq(df, addr, hwm=50, format=None, bind=False): """ Run DataFlow and send data to a ZMQ socket addr. It will serialize and send each datapoint to this address with a PUSH socket. This function never returns. Args: df (DataFlow): Will infinitely loop over the DataFlow. addr: a ZMQ socket endpoint. hwm (int): ZMQ high-water mark (buffer size) format (str): The serialization format. Default format uses :mod:`tensorpack.utils.serialize`. This format works with :class:`dataflow.RemoteDataZMQ`. An alternate format is 'zmq_ops', used by https://github.com/tensorpack/zmq_ops and :class:`input_source.ZMQInput`. bind (bool): whether to bind or connect to the endpoint address. """ assert format in [None, 'zmq_op', 'zmq_ops'] if format is None: dump_fn = dumps else: from zmq_ops import dump_arrays dump_fn = dump_arrays ctx = zmq.Context() socket = ctx.socket(zmq.PUSH) socket.set_hwm(hwm) if bind: socket.bind(addr) else: socket.connect(addr) try: df.reset_state() logger.info("Serving data to {} with {} format ...".format( addr, 'default' if format is None else 'zmq_ops')) INTERVAL = 200 q = deque(maxlen=INTERVAL) try: total = df.size() except NotImplementedError: total = 0 tqdm_args = get_tqdm_kwargs(leave=True, smoothing=0.8) tqdm_args['bar_format'] = tqdm_args['bar_format'] + "{postfix}" while True: with tqdm.trange(total, **tqdm_args) as pbar: for dp in df.get_data(): start = time.time() socket.send(dump_fn(dp), copy=False) q.append(time.time() - start) pbar.update(1) if pbar.n % INTERVAL == 0: avg = "{:.3f}".format(sum(q) / len(q)) pbar.set_postfix({'AvgSendLat': avg}) finally: logger.info("Exiting send_dataflow_zmq ...") socket.setsockopt(zmq.LINGER, 0) socket.close() if not ctx.closed: ctx.destroy(0)
def train(self):
    max_epoch = int(math.ceil(1. * self.max_iter / len(self.train_loader)))  # 117
    for epoch in tqdm.trange(self.epoch, max_epoch, desc='Train', ncols=80):
        self.epoch = epoch
        self.train_epoch()
        if self.iteration >= self.max_iter:
            break
def moran_process(N=1000, turns=10000, mean_site_muts=1, mean_rec_muts=1,
                  init=sample_species, mutate=mutate, fitness=fitness,
                  pop=None, print_modulus=100, hist_modulus=10):
    # ringer = (np.array([1]+[0]*(K-1)), sample_eps())
    if pop is None:
        pop = [(lambda spec: (spec, fitness(spec)))(init()) for _ in trange(N)]
        # ringer = make_ringer()
        # pop[0] = (ringer, fitness(ringer))
        # pop = [(ringer, fitness(ringer)) for _ in xrange(N)]
    site_mu = min(1/float(n*L) * mean_site_muts, 1)
    rec_mu = min(1/float(K) * mean_rec_muts, 1)
    hist = []
    for turn in xrange(turns):
        fits = [f for (s, f) in pop]
        # print fits
        birth_idx = inverse_cdf_sample(range(N), fits, normalized=False)
        if birth_idx is None:
            return pop
        death_idx = random.randrange(N)
        # print birth_idx, death_idx
        mother, f = pop[birth_idx]
        daughter = mutate(mother, site_mu, rec_mu)
        # print "mutated"
        pop[death_idx] = (daughter, fitness(daughter))
        mean_fits = mean(fits)
        # hist.append((f, mean_fits))
        if turn % hist_modulus == 0:
            mean_dna_ic = mean([motif_ic(sites, correct=False)
                                for ((sites, eps), _) in pop])
            mean_rec = mean([recognizer_promiscuity(x) for (x, f) in pop])
            mean_recced = mean([sites_recognized((dna, rec)) for ((dna, rec), _) in pop])
            hist.append((turn, f, mean_fits, mean_dna_ic, mean_rec, mean_recced))
            if turn % print_modulus == 0:
                print turn, "sel_fit:", f, "mean_fit:", mean_fits, "mean_dna_ic:", mean_dna_ic, "mean_rec_prom:", mean_rec
    return pop, hist
def do(syst, ax):
    # data
    single_run_matrices = []
    for _ in trange(reps):
        sol = solve_system(syst)
        sol_extract = sol.T[int(len(sol.T) * 3 / 4):]

        single_run_mat = compute_correlation_matrix(np.array([sol_extract]))

        if single_run_mat.shape == (4, 4):
            single_run_mat = single_run_mat[:-1, :-1]
        assert single_run_mat.shape == (3, 3)

        single_run_matrices.append(single_run_mat)
    single_run_matrices = np.asarray(single_run_matrices)

    # plotting
    cols = cycle(['b', 'r', 'g', 'c', 'm', 'y', 'k'])
    for i, row in enumerate(single_run_matrices.T):
        for j, series in enumerate(row):
            if i == j:
                break

            plot_histogram(
                series[series != 1], ax,
                label=r'$c_{{{},{}}}$'.format(i, j),
                facecolor=next(cols), alpha=0.5,
                bins=100)
def __init__(self, cache=None, **kwargs):
    super(GTZAN, self).__init__(**kwargs)
    if kwargs.get('conf') is not None:
        conf = kwargs['conf']
        cache = conf.get('cache', None)
    data_set_path = osp.join(DEFAULT_IMAGEST_BASE, self.data_set)
    self.data_set_path = data_set_path
    self.cache = cache
    X, y = parse_anno_file(data_set_path)
    if cache == 'raw':
        import librosa
        from tqdm import trange
        X_new = np.zeros((len(X), 1, 661500, 1))
        for i in trange(len(X)):
            x, _ = librosa.load(osp.join(DEFAULT_DATA_BASE, X[i]))
            x_len = min(661500, len(x))
            X_new[i, :, :x_len, 0] = x[:x_len]
    if cache is not None and cache != 'raw':
        X = self.load_cache_X(X, cache)
        if cache == 'mfcc':
            X_new = np.zeros((len(X), X[0].shape[0], 1280, 1))
            for i, x in enumerate(X):
                x_len = min(x.shape[1], 1280)
                X_new[i, :, :x_len, 0] = x[:, :x_len]
            X = X_new

    # layout_X
    if self.layout_x == 'rel_path':
        self.X = X
    else:
        self.X = self.init_layout_X(X)

    # layout_y
    self.y = self.init_layout_y(y)
def stress(minutes):
    """Perform a CPU and memory stress test for the given `minutes`.

    The CPU stress test uses one thread per core, and the RAM stress test one
    thread per core, totalling all main memory available to user processes.

    Return a boolean indicating whether the stress test was successful.
    """
    with open('/proc/cpuinfo') as cpuinfo:
        ncores = len(re.findall(r'^processor\b', cpuinfo.read(), re.M))
    with open('/proc/meminfo') as meminfo:
        match = re.search(r'^MemAvailable:\s*([0-9]+) kB.*', meminfo.read(), re.M)
        mem_kib = int(match.group(1))
    # Exclude a percentage of available memory for the stress processes themselves.
    mem_worker_kib = (mem_kib / ncores) * 90 / 100
    proc = subprocess.Popen([
        "stress",
        "-c", str(ncores),
        "-m", str(ncores),
        "--vm-bytes", "%dK" % mem_worker_kib,
        "-t", "%dm" % minutes],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
    for _ in tqdm.trange(minutes * 60):
        # update progress bar every second
        time.sleep(1)
    proc.communicate()  # wait for process, consume output
    return proc.returncode == 0
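# Usage sketch (the 10-minute duration is only an example value):
#
#     if not stress(10):
#         raise RuntimeError("hardware stress test failed")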
def launch_experiments(ag_results, mode, nbNodes, nbRuns, proba_edge):
    folderName = str(nbNodes) + mode
    os.makedirs(folderName, exist_ok=True)
    os.chdir(folderName)
    print("Experiment in mode ", mode, " with ", nbNodes, " nodes ", end='')
    if nbRuns > 0:
        print("with ", nbRuns, " runs")
        for i in trange(nbRuns):
            subprocess.run(os.path.join(os.path.dirname(sys.path[0]), programPath)
                           + " " + mode + " " + str(nbNodes) + " " + str(proba_edge)
                           + " 2>> errors.txt",
                           stdout=subprocess.DEVNULL, shell=True)
            # subprocess.run(programPath + " " + mode + " " + str(nbNodes), check=True, stdout=subprocess.DEVNULL, shell=True)
    else:
        print("without runs: reusing results from previous invocation")

    # TODO: rather do that as a 3rd phase? So that it's possible to relaunch
    # experiments and have them included
    results = []
    files = glob.glob("complex_graph*.csv")
    print("Collating results")
    for f in tqdm(files):
        # data = np.genfromtxt(f, delimiter="\t", encoding=None, dtype=[('Quality', '<i8'), ('Budget', '<i8'), ('ExpectRemainingTime', '<i8'), ('Deadline', '<i8'), ('NbNodes', '<i8'), ('ExecutionTime', '<i8'), ('ChoosingDuration', '<i8'), ('CallbackFlags', 'S16')], names=True)
        data = np.genfromtxt(f, delimiter="\t", encoding=None, names=True, skip_header=1,
                             dtype=[('Quality', '<f8'), ('Budget', '<f8'),
                                    ('ExpectRemainingTime', '<f8'), ('Deadline', '<f8'),
                                    ('NbDegradedNodes', '<f8'), ('NbResamplers', '<f8'),
                                    ('ExecutionTime', '<f8'), ('ChoosingDuration', '<f8'),
                                    ('CallbackFlags', '<U7')])
        # data = np.genfromtxt(f, delimiter="\t", encoding=None, dtype=None, names=True, skip_header=1)
        nbActualNodes = -1
        nbActualEdges = -1
        with open(f, "r") as datafile:
            line1 = datafile.readline().split(' ')
            nbActualNodes = int(line1[0])  # Always equal to nbNodes by construction of the random graph
            nbActualEdges = int(line1[1])
        nbCycles = data.size  # data should be 1D (each element is a dictionary)
        degraded = nbCycles - np.count_nonzero(data["Quality"])
        ag_results[(nbNodes, mode)].append((data, nbActualEdges, nbCycles, degraded))
    os.chdir("..")
def adev_at_tau_wrapper(idxs):
    if idxs[0] == 0:
        for i in trange(len(idxs)):
            adev_at_tau(idxs[i])
    else:
        for i in idxs:
            adev_at_tau(i)
def loop(model: Layer,
         images: List[Tensor],
         labels: List[Tensor],
         loss: Loss,
         optimizer: Optimizer = None) -> None:
    correct = 0         # Track number of correct predictions.
    total_loss = 0.0    # Track total loss.

    with tqdm.trange(len(images)) as t:
        for i in t:
            predicted = model.forward(images[i])             # Predict.
            if argmax(predicted) == argmax(labels[i]):       # Check for
                correct += 1                                 # correctness.
            total_loss += loss.loss(predicted, labels[i])    # Compute loss.

            # If we're training, backpropagate gradient and update weights.
            if optimizer is not None:
                gradient = loss.gradient(predicted, labels[i])
                model.backward(gradient)
                optimizer.step(model)

            # And update our metrics in the progress bar.
            avg_loss = total_loss / (i + 1)
            acc = correct / (i + 1)
            t.set_description(f"mnist loss: {avg_loss:.3f} acc: {acc:.3f}")
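# Usage sketch -- the optimizer/loss classes and data names below are
# placeholders in the style of the surrounding code, not definitions taken
# from the original module:
#
#     optimizer = Momentum(learning_rate=0.01, momentum=0.99)
#     loss = SoftmaxCrossEntropy()
#     loop(model, train_images, train_labels, loss, optimizer)   # train: updates weights
#     loop(model, test_images, test_labels, loss)                # evaluate: no optimizer, no updates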
def main(): file_dir = "raw_data/by_class" train_data = {'users': [], 'user_data': {}, 'num_samples': []} test_data = {'users': [], 'user_data': {}, 'num_samples': []} train_path = "train/mytrain.json" test_path = "test/mytest.json" X = [[] for _ in range(NUM_USER)] y = [[] for _ in range(NUM_USER)] nist_data = {} for class_ in os.listdir(file_dir): real_class = relabel_class(class_) if real_class >= 36 and real_class <= 61: full_img_path = file_dir + "/" + class_ + "/train_" + class_ all_files_this_class = os.listdir(full_img_path) random.shuffle(all_files_this_class) sampled_files_this_class = all_files_this_class[:7000] imgs = [] for img in sampled_files_this_class: imgs.append(load_image(full_img_path + "/" + img)) class_ = relabel_class(class_) print(class_) nist_data[class_ - 36] = imgs # a list of list, key is (0, 25) print(len(imgs)) # assign samples to users by power law num_samples = np.random.lognormal(4, 2, (NUM_USER)) + 5 idx = np.zeros(26, dtype=np.int64) for user in range(NUM_USER): num_sample_per_class = int(num_samples[user] / CLASS_PER_USER) if num_sample_per_class < 2: num_sample_per_class = 2 for j in range(CLASS_PER_USER): class_id = (user + j) % 26 if idx[class_id] + num_sample_per_class < len(nist_data[class_id]): idx[class_id] = 0 X[user] += nist_data[class_id][idx[class_id]:( idx[class_id] + num_sample_per_class)] y[user] += (class_id * np.ones(num_sample_per_class)).tolist() idx[class_id] += num_sample_per_class # Create data structure train_data = {'users': [], 'user_data': {}, 'num_samples': []} test_data = {'users': [], 'user_data': {}, 'num_samples': []} for i in trange(NUM_USER, ncols=120): uname = 'f_{0:05d}'.format(i) combined = list(zip(X[i], y[i])) random.shuffle(combined) X[i][:], y[i][:] = zip(*combined) num_samples = len(X[i]) train_len = int(0.9 * num_samples) test_len = num_samples - train_len train_data['users'].append(uname) train_data['user_data'][uname] = { 'x': X[i][:train_len], 'y': y[i][:train_len] } train_data['num_samples'].append(train_len) test_data['users'].append(uname) test_data['user_data'][uname] = { 'x': X[i][train_len:], 'y': y[i][train_len:] } test_data['num_samples'].append(test_len) with open(train_path, 'w') as outfile: json.dump(train_data, outfile) with open(test_path, 'w') as outfile: json.dump(test_data, outfile)
    datetime.timedelta(hours=4), datetime.timedelta(days=1)
]
alltime_dfs = {}
for ashi, tmd in zip(ashis, tmds):
    df = pd.read_csv(rh_root + "/alltime/market_" + ashi + ".csv", parse_dates=True)
    df["openTime"] = pd.to_datetime(df.openTime)
    df["closeTime"] = df.openTime.shift(-1)
    df.closeTime[len(df) - 1] = df.openTime[len(df) - 1] + tmd
    alltime_dfs[ashi] = df
df = alltime_dfs["h01"]

for weeki in trange(104, 180):
    kireme_d = datetime.datetime(year=2018, month=1, day=7) + datetime.timedelta(days=7) * weeki
    if kireme_d >= datetime.datetime(year=2020, month=12, day=29):
        break
    owbf_d = kireme_d - datetime.timedelta(days=7)
    dfs = {}
    for ashi in ashis:
        df = alltime_dfs[ashi]
        kireme_x = len(df[df.openTime <= kireme_d])
        owbf_x = len(df[df.openTime <= owbf_d])
        df = df[max(owbf_x - yoyuu, 0):kireme_x]
        df = df.reset_index(drop=True)
        df["openX"] = df.index
        df["closeX"] = df.openX + 1
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=25)

optimizer = torch.optim.Adam(model.parameters(), lr=1.e-3)

loss_function = torch.nn.NLLLoss()
loss_function.to(device)

tb_logger = torch.utils.tensorboard.SummaryWriter('runs/log_mlp')
# %tensorboard --logdir runs

# train for a couple of epochs
n_epochs = 4
for epoch in trange(n_epochs):
    utils.train(model, train_loader, loss_function, optimizer,
                device, epoch, tb_logger=tb_logger)
    step = (epoch + 1) * len(train_loader)
    utils.validate(model, val_loader, loss_function,
                   device, step, tb_logger=tb_logger)
def PMF(X: np.ndarray, d: int, l: float, s: float = 1, max_iter: int = 100, print_cost: int = 0, pretrained_u: np.ndarray = None, pretrained_v: np.ndarray = None) -> tuple: """ Probabilistic Matrix Factorization algorithm implementation :param X: an observed matrix, dim(X) = n x m :param l: regularization parameter (lambda) :param s: regularization parameter (standard deviation) :param d: number of latent features :param max_iter: maximum iterations of the algorithm, default 100 :param print_cost: if > 0 prints cost function every n-th iteration of the algorithm, if = 0 does not print the cost at all :param pretrained_u: if you want to run more epochs with pretrained matrices, please, specify both; default None :param pretrained_v: see param pretrained_u :return: U, V - latent features matrices with dim(U) = d x n, dim(V) = d X m """ n, m = X.shape ind = np.ones(X.shape) ind[X == 0] = 0 # indicator matrix # initialize latent features matrices if pretrained_u is None and pretrained_v is None: v = np.random.normal(0, 1 / l, (d, m)) u = np.zeros((d, n)) else: v = pretrained_v u = pretrained_u # suggestion from the lecturer: Omega_u = [list(np.where(X[i, :] > 0)[0]) for i in range(n) ] # Omega_u[i] - масив індексів, для яких M_ij - observed Omega_v = [list(np.where(X[:, j] > 0)[0]) for j in range(m)] # iterate through u,v for k in range(max_iter): # calculate U for i in trange(n): u[:, i] = ((1 / (l * s**2 + np.array([ ind[i, j] * np.linalg.norm(v[:, j].reshape(d, 1), ord='fro')**2 for j in Omega_u[i] ]).sum()) * (v @ (ind * X)[i, :].reshape((1, m)).T)).reshape( (d, ))) # calculate V for j in trange(m): v[:, j] = ((1 / (l * s**2 + np.array([ ind[i, j] * np.linalg.norm(u[:, i].reshape(d, 1), ord='fro')**2 for i in Omega_v[j] ]).sum()) * (u @ (ind * X)[:, j].reshape((1, n)).T)).reshape( (d, ))) # save this for later # np.save('/home/olga/Projects/HW_ML/lab4/experiment/u.npy', u) # np.save('/home/olga/Projects/HW_ML/lab4/experiment/v.npy', v) # compute cost cost = ((s**(-2)) * np.linalg.norm(ind * (X - u.T @ v), ord='fro')**2 + l * np.linalg.norm(u, ord='fro')**2 + l * np.linalg.norm(v, ord='fro')**2) / 2 if print_cost and (k + 1) % print_cost == 0: print(f"{k+1} iteration cost: {np.log(cost):.5f}") return u, v
def main(): parser = get_argument_parser() deepspeed.init_distributed(dist_backend='nccl') args.local_rank = int(os.environ['LOCAL_RANK']) # Include DeepSpeed configuration arguments parser = deepspeed.add_config_arguments(parser) args = parser.parse_args() args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None if args.do_train: train_examples = read_squad_examples(input_file=args.train_file, is_training=True) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model # model = BertForQuestionAnswering.from_pretrained(args.bert_model, # cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank)) # Support for word embedding padding checkpoints # Prepare model bert_model_config = { "vocab_size_or_config_json_file": 119547, "hidden_size": 1024, "num_hidden_layers": 24, "num_attention_heads": 16, "intermediate_size": 4096, "hidden_act": "gelu", "hidden_dropout_prob": args.dropout, "attention_probs_dropout_prob": args.dropout, "hidden_dropout_prob": 0.1, "attention_probs_dropout_prob": 0.1, "max_position_embeddings": 512, "type_vocab_size": 2, "initializer_range": 0.02 } if args.ckpt_type == "DS": if args.preln: bert_config = BertConfigPreLN(**bert_model_config) else: bert_config = BertConfig(**bert_model_config) else: # Models from Tensorflow and Huggingface are post-LN. if args.preln: raise ValueError("Should NOT use --preln if the loading checkpoint doesn't use pre-layer-norm.") # Use the original bert config if want to load from non-DeepSpeed checkpoint. 
if args.origin_bert_config_file is None: raise ValueError("--origin_bert_config_file is required for loading non-DeepSpeed checkpoint.") bert_config = BertConfig.from_json_file(args.origin_bert_config_file) if bert_config.vocab_size != len(tokenizer.vocab): raise ValueError("vocab size from original checkpoint mismatch.") bert_config.vocab_size = len(tokenizer.vocab) # Padding for divisibility by 8 if bert_config.vocab_size % 8 != 0: vocab_diff = 8 - (bert_config.vocab_size % 8) bert_config.vocab_size += vocab_diff if args.preln: model = BertForQuestionAnsweringPreLN(bert_config, args) else: model = BertForQuestionAnswering(bert_config, args) print("VOCAB SIZE:", bert_config.vocab_size) if args.model_file is not "0": logger.info(f"Loading Pretrained Bert Encoder from: {args.model_file}") if args.ckpt_type == "DS": checkpoint_state_dict = torch.load(args.model_file, map_location=torch.device("cpu")) if 'module' in checkpoint_state_dict: logger.info('Loading DeepSpeed v2.0 style checkpoint') model.load_state_dict(checkpoint_state_dict['module'], strict=False) elif 'model_state_dict' in checkpoint_state_dict: model.load_state_dict(checkpoint_state_dict['model_state_dict'], strict=False) else: raise ValueError("Unable to find model state in checkpoint") else: from convert_bert_ckpt_to_deepspeed import convert_ckpt_to_deepspeed convert_ckpt_to_deepspeed(model, args.ckpt_type, args.model_file, vocab_diff, args.deepspeed_transformer_kernel) logger.info(f"Pretrained Bert Encoder Loaded from: {args.model_file}") # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 },{ 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] model, optimizer, _, _ = deepspeed.initialize( args=args, model=model, model_parameters=optimizer_grouped_parameters, dist_init_required=True) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs #torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict: raise ValueError( "At least one of `do_train` or `do_predict` must be True.") if args.do_train: if not args.train_file: raise ValueError( "If `do_train` is True, then `train_file` must be specified.") if args.do_predict: if not args.predict_file: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified." 
) if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: os.makedirs(args.output_dir, exist_ok=True) # Prepare Summary writer if torch.distributed.get_rank() == 0 and args.job_name is not None: args.summary_writer = get_summary_writer(name=args.job_name, base=args.output_dir) else: args.summary_writer = None logger.info("propagate deepspeed-config settings to client settings") args.train_batch_size = model.train_micro_batch_size_per_gpu() args.gradient_accumulation_steps = model.gradient_accumulation_steps() args.fp16 = model.fp16_enabled() args.print_steps = model.steps_per_print() args.learning_rate = model.get_lr()[0] args.wall_clock_breakdown = model.wall_clock_breakdown() t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() global_step = 0 if args.do_train: cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}'.format( list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length)) train_features = None try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) except: train_features = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True) if args.local_rank == -1 or torch.distributed.get_rank() == 0: logger.info(" Saving train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(train_features, writer) logger.info("***** Running training *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_start_positions = torch.tensor( [f.start_position for f in train_features], dtype=torch.long) all_end_positions = torch.tensor( [f.end_position for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_positions, all_end_positions) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() ema_loss = 0. sample_count = 0 num_epoch = 0 all_step_time = 0.0 ave_rounds = 20 for _ in trange(int(args.num_train_epochs), desc="Epoch"): num_epoch += 1 epoch_step = 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration", smoothing=0)): start_time = time.time() bs_size = batch[0].size()[0] if n_gpu == 1: batch = tuple( t.to(device) for t in batch) # multi-gpu does scattering it-self input_ids, input_mask, segment_ids, start_positions, end_positions = batch loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps ema_loss = args.loss_plot_alpha * ema_loss + ( 1 - args.loss_plot_alpha) * loss.item() model.backward(loss) loss_item = loss.item() * args.gradient_accumulation_steps loss = None sample_count += (args.train_batch_size * torch.distributed.get_world_size()) if (step + 1) % args.gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses lr_this_step = args.learning_rate * warmup_linear( global_step / t_total, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step model.step() global_step += 1 epoch_step += 1 if torch.distributed.get_rank( ) == 0 and args.summary_writer: summary_events = [ (f'Train/Steps/lr', lr_this_step, global_step), (f'Train/Samples/train_loss', loss_item, sample_count), (f'Train/Samples/lr', lr_this_step, sample_count), (f'Train/Samples/train_ema_loss', ema_loss, sample_count) ] if args.fp16 and hasattr(optimizer, 'cur_scale'): summary_events.append( (f'Train/Samples/scale', optimizer.cur_scale, sample_count)) write_summary_events(args.summary_writer, summary_events) args.summary_writer.flush() if torch.distributed.get_rank() == 0 and ( step + 1) % args.print_steps == 0: logger.info( f"bert_squad_progress: step={global_step} lr={lr_this_step} loss={ema_loss}" ) else: model.step() if is_time_to_exit(args=args, epoch_steps=epoch_step, global_steps=global_step): logger.info( f'Warning: Early epoch termination due to max steps limit, epoch step ={epoch_step}, global step = {global_step}, epoch = {num_epoch}' ) break one_step_time = time.time() -start_time all_step_time += one_step_time if (step + 1)%(ave_rounds) == 0 and torch.distributed.get_rank() == 0: print('At Step {}, Averaged Throughput for {} rounds is: {} Samples/s'.format(step, ave_rounds, bs_size * ave_rounds * torch.distributed.get_world_size() / all_step_time ), flush=True ) all_step_time = 0.0 # Save a trained model # model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self #output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") # if args.do_train: # torch.save(model_to_save.state_dict(), output_model_file) # Load a trained model that you have fine-tuned #model_state_dict = torch.load(output_model_file) #model = BertForQuestionAnswering.from_pretrained(args.bert_model, state_dict=model_state_dict) # model.to(device) if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = read_squad_examples(input_file=args.predict_file, is_training=False) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) logger.info("***** Running predictions *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.predict_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) # Run prediction for full data eval_sampler = 
SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) model.eval() all_results = [] logger.info("Start evaluating") for input_ids, input_mask, segment_ids, example_indices in tqdm( eval_dataloader, desc="Evaluating"): if len(all_results) % 1000 == 0: logger.info("Processing example: %d" % (len(all_results))) input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) with torch.no_grad(): batch_start_logits, batch_end_logits = model( input_ids, segment_ids, input_mask) for i, example_index in enumerate(example_indices): start_logits = batch_start_logits[i].detach().cpu().tolist() end_logits = batch_end_logits[i].detach().cpu().tolist() eval_feature = eval_features[example_index.item()] unique_id = int(eval_feature.unique_id) all_results.append( RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits)) output_prediction_file = os.path.join(args.output_dir, "predictions.json") output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json") write_predictions(eval_examples, eval_features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, args.verbose_logging)
def main(): assert pyro.__version__.startswith('1.6.0') # Enable smoke test to test functionality smoke_test = False logging.info(f"CUDA available: {torch.cuda.is_available()}") # Loading data logging.info("Loading data...") docs = prepro_file_load("doc_word_matrix").to_dense() doc_categories = prepro_file_load("doc_cat_one_hot_matrix") # doc_categories = torch.t(torch.reshape(torch.Tensor(list(prepro_file_load("doc2category").values())), (1, -1))) id2word = prepro_file_load("id2word") id2cat = prepro_file_load("id2category") # Put vocab into dataframe for exploration of data vocab = pd.DataFrame(columns=['index', 'word']) vocab['index'] = list(id2word.keys()) vocab['word'] = list(id2word.values()) logging.info(f"Vocab dictionary size: {len(vocab)}") logging.info(f"Corpus size: {docs.shape}") # Setting global variables seed = 0 torch.manual_seed(seed) pyro.set_rng_seed(seed) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") docs = docs.float() doc_categories = doc_categories.float() num_categories = len(id2cat) num_topics = num_categories * 2 if not smoke_test else 3 batch_size = 32 learning_rate = 1e-3 num_epochs = 50 if not smoke_test else 1 # Training pyro.clear_param_store() prodLDA = CategoryProdLDA(vocab_size=docs.shape[1], num_topics=num_topics, num_categories=num_categories, hidden=100 if not smoke_test else 10, dropout=0.2) prodLDA.to(device) optimizer = pyro.optim.Adam({"lr": learning_rate}) svi = SVI(prodLDA.model, prodLDA.guide, optimizer, loss=TraceMeanField_ELBO()) num_batches = int(math.ceil(docs.shape[0] / batch_size)) if not smoke_test else 1 losses = [] logging.info("Training...") bar = trange(num_epochs) for epoch in bar: running_loss = 0.0 for i in range(num_batches): batch_docs = docs[i * batch_size:(i + 1) * batch_size, :].to(device) batch_cats = doc_categories[i * batch_size:(i + 1) * batch_size, :].to(device) loss = svi.step(batch_docs, batch_cats) running_loss += loss / batch_docs.size(0) # Save and log losses losses.append(running_loss) bar.set_postfix(epoch_loss='{:.2e}'.format(running_loss)) if epoch % 5 == 0: logging.info('{: >5d}\t{}'.format(epoch, '{:.2e}'.format(running_loss))) logging.info(f"Final loss: {'{:.2e}'.format(losses[-1])}/{losses[-1]}") if not smoke_test: # Plot loss over epochs plt.plot(losses) plt.title("ELBO") plt.xlabel("Epoch") plt.ylabel("Loss") plot_file_name = "../ProdCategoryLDA-loss-2017_categories-" + str(num_categories) + \ "_topics-" + str(num_topics) + \ "_batch-" + str(batch_size) + \ "_lr-" + str(learning_rate) + \ "_epochs-" + str(num_epochs) + \ ".png" plt.savefig(plot_file_name) plt.show() # Logging top 10 weighted words in topics beta = prodLDA.beta() for n in range(beta.shape[0]): sorted_, indices = torch.sort(beta[n], descending=True) df = pd.DataFrame(indices[:10].numpy(), columns=['index']) words = pd.merge(df, vocab[['index', 'word']], how='left', on='index')['word'].values.tolist() logging.info(f"Topic {n}: {words}")
def train(train_dataset, model, tokenizer, hyperparams): verbose = hyperparams["verbose"] disable = False if verbose else True local_rank = hyperparams["local_rank"] per_gpu_train_batch_size = hyperparams["per_gpu_train_batch_size"] n_gpu = hyperparams["n_gpu"] max_steps = hyperparams["max_steps"] num_train_epochs = hyperparams["num_train_epochs"] gradient_accumulation_steps = hyperparams["gradient_accumulation_steps"] weight_decay = hyperparams["weight_decay"] learning_rate = hyperparams["learning_rate"] adam_epsilon = hyperparams["adam_epsilon"] warmup_steps = hyperparams["warmup_steps"] seed = hyperparams["random_state"] device = hyperparams["device"] model_type = hyperparams["model_type"] max_grad_norm = hyperparams["max_grad_norm"] save_steps = hyperparams['save_steps'] output_dir = hyperparams["output_dir"] log_path = os.path.join(output_dir, "log.csv") fp16_opt_level = hyperparams["fp16_opt_level"] fp16 = hyperparams["fp16"] model_name_or_path = hyperparams["model_name_or_path"] opt_path = os.path.join(model_name_or_path, "optimizer.pt") sche_path = os.path.join(model_name_or_path, "scheduler.pt") training_logs = {"loss": [], "learning_rate": []} train_batch_size = per_gpu_train_batch_size * max(1, n_gpu) if local_rank == -1: train_sampler = RandomSampler(train_dataset) else: DistributedSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=train_batch_size) if max_steps > 0: t_total = max_steps num_train_epochs = max_steps // (len(train_dataloader) // gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader) // gradient_accumulation_steps * num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon) scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total) # Check if saved optimizer or scheduler states exist if os.path.isfile(opt_path) and os.path.isfile(sche_path): # Load in optimizer and scheduler states optimizer.load_state_dict(torch.load(opt_path)) scheduler.load_state_dict(torch.load(sche_path)) if fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[local_rank], output_device=local_rank, find_unused_parameters=True) # Train! logging.info("***** Running training *****") logging.info(" Num examples = %d", len(train_dataset)) logging.info(" Num Epochs = %d", num_train_epochs) logging.info(" Instantaneous batch size per GPU = %d", per_gpu_train_batch_size) logging.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", train_batch_size * gradient_accumulation_steps * (torch.distributed.get_world_size() if local_rank != -1 else 1)) logging.info(" Gradient Accumulation steps = %d", gradient_accumulation_steps) logging.info(" Total optimization steps = %d", t_total) global_step = 0 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if os.path.exists( model_name_or_path) and model_name_or_path.find("checkpoints") > 0: # set global_step to gobal_step of last saved checkpoint from model # path global_step = int(model_name_or_path.split("-")[-1].split("/")[0]) epochs_trained = global_step // (len(train_dataloader) // gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % ( len(train_dataloader) // gradient_accumulation_steps) logging.info( " Continuing training from checkpoint, will skip to saved global_step" ) logging.info(" Continuing training from epoch %d", epochs_trained) logging.info(" Continuing training from global step %d", global_step) logging.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) tr_loss = 0.0 model.zero_grad() set_seed(seed, n_gpu=n_gpu) # Added here for reproductibility train_iterator = trange(epochs_trained, int(num_train_epochs), desc="Epoch", disable=disable) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=disable) for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue model.train() batch = tuple(t.to(device) for t in batch) inputs = { "input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3] } # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids if model_type != "distilbert": inputs["token_type_ids"] = (batch[2] if model_type in [ "bert", "xlnet", "albert" ] else None) outputs = model(**inputs) loss = outputs[0] if n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() training_logs["loss"].append(loss.item()) training_logs["learning_rate"].append(scheduler.get_last_lr()[0]) if (step + 1) % gradient_accumulation_steps == 0: if fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if local_rank in [ -1, 0 ] and save_steps > 0 and global_step % save_steps == 0: # Save model checkpoint output_dir = os.path.join(output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) # Take care of distributed/parallel training model_to_save = (model.module if hasattr(model, "module") else model) model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save( hyperparams, os.path.join(output_dir, "training_hyperparams.bin")) logging.info("Saving model checkpoint to %s", output_dir) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logging.info("Saving optimizer and scheduler states to %s", output_dir) if max_steps > 0 and 
global_step > max_steps: epoch_iterator.close() break if max_steps > 0 and global_step > max_steps: train_iterator.close() break training_logs = pd.DataFrame(training_logs) training_logs.to_csv(log_path, index=False) return global_step, tr_loss / global_step
def train(args, train_dataset, model, tokenizer): """ Train the model """ if args.local_rank in [-1, 0]: # here set log dir for visulization tb_writer = SummaryWriter(comment=args.log_comment) args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] # if freeze BERT parameters if args.freeze_bert: for params in model.bert.parameters(): params.requires_grad = False optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) if args.scheduler == "linear": scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) elif args.scheduler == "cosine": scheduler = WarmupCosineSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info(" Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) args.logging_steps = t_total // args.num_train_epochs logger.info(" Logging steps = %d", args.logging_steps) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) set_seed(args) # Added here for reproductibility (even between python 2 and 3) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): model.train() batch = tuple(t.to(args.device) for t in batch) inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'start_positions': batch[3], 'end_positions': batch[4], 'concept_ids': batch[7], 'concept_masks': batch[8]} if args.model_type != 'distilbert': inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2] if args.model_type in ['xlnet', 'xlm']: inputs.update({'cls_index': batch[5], 'p_mask': batch[6]}) outputs = model(**inputs) loss = outputs[0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): tb_writer.add_scalar('eval_{}'.format(key), value, global_step) tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step) logging_loss = tr_loss if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Save model checkpoint output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, 'training_args.bin')) logger.info("Saving model checkpoint to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
def train(args, train_dataset, model): # use tensorboard to keep track of training process tb_writer = SummaryWriter('loss') #train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, batch_size=args.train_batch_size, shuffle=True) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=len(train_dataloader) * args.num_train_epochs) # Train! logging.info("***** Running training *****") #logging.info(" Num examples = %d", len(train_dataset)) logging.info(" Num Epochs = %d", args.num_train_epochs) logging.info(" Let's start finetuning!") tr_loss, logging_loss = 0.0, 0.0 global_step = 0 for epoch in tqdm.trange(args.num_train_epochs, desc='Epoch'): #epoch_iterator = tqdm(train_dataloader, desc="Iteration") for step, batch in tqdm.tqdm(enumerate(train_dataloader)): model.train() outputs = model( is_training=True, input_ids=batch["input_ids"].long().to(DEVICE), attention_mask=batch['input_mask'].long().to(DEVICE), token_type_ids=batch['segment_ids'].long().to(DEVICE), start_positions=batch["start_positions"].long().to(DEVICE), end_positions=batch["end_positions"].long().to(DEVICE), answer_types=batch["answer_types"].long().to(DEVICE)) loss = outputs[-1] if args.grad_acc_steps > 1: loss = loss / args.grad_acc_steps loss.backward() tr_loss += loss.item() if (step + 1) % args.grad_acc_steps == 0: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() model.zero_grad() global_step += 1 if global_step % args.logging_steps == 0: tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) print('loss', (tr_loss - logging_loss) / args.logging_steps) logging_loss = tr_loss #empty cahce del batch torch.cuda.empty_cache() #loggin points # save checkpoint #if global_step % args.save_steps == 0: output_dir = os.path.join(args.output_dir, "checkpoint-3{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) # Take care of distributed/parallel training model_to_save = model.module if hasattr(model, "module") else model model_to_save.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logging.info("Saving model checkpoint to %s", output_dir) #torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) #logging.info("Saving optimizer and scheduler states to %s", output_dir) return global_step, tr_loss / global_step
def run(self, thunk, num_cpu=1, data_dir=None, datestamp=False): """ Run each variant in the grid with function 'thunk'. Note: 'thunk' must be either a callable function, or a string. If it is a string, it must be the name of a parameter whose values are all callable functions. Uses ``call_experiment`` to actually launch each experiment, and gives each variant a name using ``self.variant_name()``. Maintenance note: the args for ExperimentGrid.run should track closely to the args for call_experiment. However, ``seed`` is omitted because we presume the user may add it as a parameter in the grid. """ # Print info about self. self.print() # Make the list of all variants. variants = self.variants() # Print variant names for the user. var_names = set([self.variant_name(var) for var in variants]) var_names = sorted(list(var_names)) line = '='*DIV_LINE_WIDTH preparing = colorize('Preparing to run the following experiments...', color='green', bold=True) joined_var_names = '\n'.join(var_names) announcement = f"\n{preparing}\n\n{joined_var_names}\n\n{line}" print(announcement) if WAIT_BEFORE_LAUNCH > 0: delay_msg = colorize(dedent(""" Launch delayed to give you a few seconds to review your experiments. To customize or disable this behavior, change WAIT_BEFORE_LAUNCH in spinup/user_config.py. """), color='cyan', bold=True)+line print(delay_msg) wait, steps = WAIT_BEFORE_LAUNCH, 100 prog_bar = trange(steps, desc='Launching in...', leave=False, ncols=DIV_LINE_WIDTH, mininterval=0.25, bar_format='{desc}: {bar}| {remaining} {elapsed}') for _ in prog_bar: time.sleep(wait/steps) # Run the variants. for var in variants: exp_name = self.variant_name(var) # Figure out what the thunk is. if isinstance(thunk, str): # Assume one of the variant parameters has the same # name as the string you passed for thunk, and that # variant[thunk] is a valid callable function. thunk_ = var[thunk] del var[thunk] else: # Assume thunk is given as a function. thunk_ = thunk call_experiment(exp_name, thunk_, num_cpu=num_cpu, data_dir=data_dir, datestamp=datestamp, **var)
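The launch delay above drives a trange bar purely for its countdown display. A standalone sketch, with WAIT_SECONDS and the column width as made-up stand-ins for WAIT_BEFORE_LAUNCH and DIV_LINE_WIDTH:

# Standalone sketch of the launch-delay countdown above; WAIT_SECONDS and
# ncols are illustrative stand-ins for WAIT_BEFORE_LAUNCH / DIV_LINE_WIDTH.
import time
from tqdm import trange

WAIT_SECONDS, STEPS = 5, 100
for _ in trange(STEPS, desc='Launching in...', leave=False, ncols=80,
                mininterval=0.25,
                bar_format='{desc}: {bar}| {remaining} {elapsed}'):
    time.sleep(WAIT_SECONDS / STEPS)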
def train(args, train_dataset, model, tokenizer): """ Train the model """ args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs warm_up_steps=int(args.warmup_steps*t_total) logging_steps=int(args.logging_steps*t_total) save_steps=int(args.save_steps*t_total) # Prepare optimizer and schedule (linear warmup and decay) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warm_up_steps, t_total=t_total) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 max_acc=0 max_f1=0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch") set_seed(args) # Added here for reproductibility (even between python 2 and 3) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration") for step, batch in enumerate(epoch_iterator): model.train() batch = tuple(t.to(args.device) for t in batch) inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'align_mask': batch[2], 'labels': batch[4]} inputs['token_type_ids'] = batch[3] outputs = model(**inputs) loss = outputs[0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if logging_steps > 0 and global_step % logging_steps == 0: if args.evaluate_during_training: results = evaluate(args, model, tokenizer) for key, value in results.items(): if key=="acc": max_acc=max([max_acc,value]) with open(os.path.join(args.output_dir, "acc.txt"), 'a+') as w: w.write("%d\t%f\t%f\n" % (global_step, value, max_acc)) if key == "f1": max_f1=max([max_f1,value]) with open(os.path.join(args.output_dir, "f1.txt"), 'a+') as w: w.write("%d\t%f\t%f\n" % (global_step, value, max_f1)) with open(os.path.join(args.output_dir, "loss.txt"), 'a+') as w: w.write("%d\t%f\n"%(global_step, (tr_loss - logging_loss) / logging_steps)) logging_loss = tr_loss if save_steps > 0 and global_step % save_steps 
== 0: # Save model checkpoint output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, 'training_args.bin')) logger.info("Saving model checkpoint to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break return global_step, tr_loss / global_step
def train(args, train_dataset, model, tokenizer): """ Train the model """ tb_writer = SummaryWriter() train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Train batch size = %d", args.train_batch_size * args.gradient_accumulation_steps) logger.info(" Gradient accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 epochs_trained = 0 #steps_trained_in_current_epoch = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", position=0, leave=True, ncols=100) set_seed(args) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", position=0, leave=True, ncols=100) for step, batch in enumerate(epoch_iterator): model.train() batch = tuple(t.to(args.device) for t in batch) inputs = { 'input_ids': batch[0], 'attention_mask': batch[1], 'token_type_ids': batch[2], 'start_positions': batch[3], 'end_positions': batch[4], 'cls_index': batch[5], 'p_mask': batch[6], 'task': 2, } outputs = model(**inputs) loss = outputs[0] if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() model.zero_grad() global_step += 1 if args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics if args.evaluate_during_training: results = evaluate(args, model, tokenizer) for key, value in results.items(): tb_writer.add_scalar('eval_{}'.format(key), value, global_step) tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint output_dir = os.path.join( args.output_dir, 'checkpoint-{}'.format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = model.module if hasattr( model, 'module') else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, 'training_args.bin')) logger.info("Saving model checkpoint to %s", output_dir) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break tb_writer.close() return global_step, tr_loss / global_step
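Both train() variants above save checkpoints by unwrapping model.module before writing. A dependency-free sketch of that pattern, using torch.save on a state_dict instead of transformers' save_pretrained (an assumption made to keep the example self-contained):

# Sketch of the periodic checkpoint pattern above, using a plain state_dict
# save rather than transformers' save_pretrained (assumption for brevity).
import os
import torch

def save_checkpoint(model, output_dir, global_step):
    ckpt_dir = os.path.join(output_dir, "checkpoint-{}".format(global_step))
    os.makedirs(ckpt_dir, exist_ok=True)
    # Unwrap DataParallel / DistributedDataParallel before saving
    model_to_save = model.module if hasattr(model, "module") else model
    torch.save(model_to_save.state_dict(), os.path.join(ckpt_dir, "pytorch_model.bin"))
    return ckpt_dir

# usage: if save_steps > 0 and global_step % save_steps == 0: save_checkpoint(model, out_dir, global_step)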
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the .tsv files (or other data files) for the task.") parser.add_argument("--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model predictions and checkpoints will be written.") ## Other parameters parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_lower_case", default=False, action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument('--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() processors = { "cola": ColaProcessor, "snli": SnliProcessor, "mrpc": MrpcProcessor, "ant": AntProcessor, 'buy_data': BuyProcessor, 'buy_mt': BuyMTProcessor, 'sent_clf': SingleSentProcessor, 'douban': DoubanProcessor, 'keyword': KeywordProcessor, } num_labels_task = { "cola": 2, "snli": 3, "mrpc": 2, "ant": 2, 'buy_data': 2, 'buy_mt': 2, 'sent_clf': 2, 'douban': 2, 'keyword': 2, } global device if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of synchronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") # if os.path.exists(args.output_dir) and os.listdir(args.output_dir): # raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) global processor global label_list global tokenizer processor = processors[task_name]() num_labels = num_labels_task[task_name] label_list = processor.get_labels() tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = 0 if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model model = BertForSequenceClassification.from_pretrained(args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format( args.local_rank), num_labels=num_labels) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: # from apex.parallel import DistributedDataParallel as DDP from torch.nn.parallel import DistributedDataParallel as DDP # do not use NVIDIA's apex package except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") model = DDP(model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] t_total = num_train_steps if args.local_rank !=
-1: t_total = t_total // torch.distributed.get_world_size() if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) global global_step global_step = 0 if args.do_train: train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() epoch_idx = 0 for _ in trange(int(args.num_train_epochs), desc="Epoch"): epoch_idx += 1 tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 start = time.time() for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() # if step % 10000 == 0: # print('step {} | loss {} | spend {} s'.format(step, loss, time.time() - start)) # start = time.time() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses lr_this_step = args.learning_rate * warmup_linear(global_step / t_total, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME + '_epoch{}'.format(epoch_idx)) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval(model, args, epoch_idx, tr_loss / nb_tr_steps) else: # Load a trained model that you have fine-tuned if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval(model, args, -10000, -10000)
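The loop above rescales the learning rate by hand with warmup_linear before each optimizer step. A hedged re-implementation of that helper as it shipped in the old pytorch_pretrained_bert package (linear ramp-up over the first warmup fraction, then linear decay), plus a few worked values:

# Hedged re-implementation of the warmup_linear schedule used above; this
# mirrors the helper from the old pytorch_pretrained_bert package.
def warmup_linear(x, warmup=0.002):
    if x < warmup:
        return x / warmup
    return 1.0 - x

# e.g. with t_total = 1000 and warmup_proportion = 0.1:
# step 50  -> lr * 0.50  (half-way through warmup)
# step 500 -> lr * 0.50  (half-way through decay)
# step 900 -> lr * 0.10  (near the end of training)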
def train_model(rank, world_size, args): """ Train the model """ if 1 < args.n_gpu: init_process_group(rank, world_size) master = (world_size == 0 or rank % world_size == 0) if master and args.wandb: wandb.init(project=args.project) vocab = load_vocab(args.vocab) config = Config.load(args.config) config.n_enc_vocab = len(vocab) config.device = f"cuda:{rank}" if torch.cuda.is_available() else "cpu" print(config) best_epoch, best_loss, best_score = 0, 0, 0 train_model = ALBertTrainMovie(config) if os.path.isfile(args.save): try: best_epoch, best_loss, best_score = train_model.load(args.save) print( f"rank: {rank} load state dict from: {os.path.basename(args.save)}" ) except: print(f'load {os.path.basename(args.save)} failed.') elif os.path.isfile(args.pretrain_save): try: epoch, loss = train_model.bert.load(args.pretrain_save) print( f"rank: {rank} load pretrain from: {os.path.basename(args.pretrain_save)}, epoch={epoch}, loss={loss}" ) except: print(f'load {os.path.basename(args.pretrain_save)} failed.') if 1 < args.n_gpu: train_model.to(config.device) # noinspection PyArgumentList train_model = DistributedDataParallel(train_model, device_ids=[rank], find_unused_parameters=True) else: train_model.to(config.device) if master and args.wandb: wandb.watch(train_model) criterion_cls = torch.nn.CrossEntropyLoss() train_loader: DataLoader = data.build_data_loader(vocab, args.train, args, data_type='train', shuffle=True) test_loader: DataLoader = data.build_data_loader(vocab, args.test, args, data_type='test', shuffle=False) t_total = len(train_loader) * args.epoch no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in train_model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [ p for n, p in train_model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = optim.AdamW(optimizer_grouped_parameters, lr=config.learning_rate, eps=config.adam_epsilon) scheduler = optim.get_linear_schedule_with_warmup( optimizer, num_warmup_steps=config.warmup_steps, num_training_steps=t_total) start_epoch = best_epoch + 1 with trange(args.epoch, desc="Epoch", position=0) as pbar: pbar.set_postfix_str( f"best epoch: {best_epoch}, loss: {best_loss:.4f}, accuracy: {best_score:.3f}" ) for step in pbar: epoch = step + start_epoch loss = train_epoch(config, rank, train_model, criterion_cls, optimizer, scheduler, train_loader) score = eval_epoch(config, rank, train_model, test_loader) if master and args.wandb: wandb.log({"loss": loss, "accuracy": score}) if master and best_score < score: best_epoch, best_loss, best_score = epoch, loss, score if isinstance(train_model, DistributedDataParallel): train_model.module.save(best_epoch, best_loss, best_score, args.save) else: train_model.save(best_epoch, best_loss, best_score, args.save) pbar.set_postfix_str( f"best epoch: {best_epoch}, loss: {best_loss:.4f}, accuracy: {best_score:.3f}" ) if 1 < args.n_gpu: destroy_process_group()
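The epoch loop above keeps the running best score in the progress bar via set_postfix_str. A minimal sketch of that trange-as-context-manager pattern, with random scores standing in for eval_epoch:

# Minimal sketch of the trange context-manager + postfix pattern above;
# random.random() is a placeholder for a real eval_epoch(...).
import random
from tqdm import trange

best_epoch, best_score = 0, 0.0
with trange(10, desc="Epoch", position=0) as pbar:
    for step in pbar:
        score = random.random()            # placeholder metric
        if score > best_score:
            best_epoch, best_score = step, score
        pbar.set_postfix_str(f"best epoch: {best_epoch}, accuracy: {best_score:.3f}")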
def train(args, train_dataset, model, tokenizer): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay, }, {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total ) # Check if saved optimizer or scheduler states exist if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile( os.path.join(args.model_name_or_path, "scheduler.pt") ): # Load in optimizer and scheduler states optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True, ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 1 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if os.path.exists(args.model_name_or_path): try: # set global_step to gobal_step of last saved checkpoint from model path checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0] global_step = int(checkpoint_suffix) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps) logger.info(" Continuing training from checkpoint, will skip to saved global_step") logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info( " Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch, ) except ValueError: logger.info(" Starting fine-tuning.") tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange( epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0], ) # Added here for reproductibility set_seed(args) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue model.train() batch = tuple(t.to(args.device) for t in batch) inputs = { "input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2], "start_positions": batch[3], "end_positions": batch[4], } if args.model_type in ["xlm", "roberta", "distilbert", "camembert"]: del inputs["token_type_ids"] if args.model_type in ["xlnet", "xlm"]: inputs.update({"cls_index": batch[5], "p_mask": batch[6]}) if args.version_2_with_negative: inputs.update({"is_impossible": batch[7]}) if hasattr(model, "config") and hasattr(model.config, "lang2id"): inputs.update( {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)} ) outputs = model(**inputs) # model outputs are always tuple in transformers (see doc) loss = outputs[0] if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 # Log metrics if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Only evaluate when single GPU otherwise metrics may not average well if args.local_rank == -1 and args.evaluate_during_training: results = evaluate(args, model, tokenizer) for key, value in results.items(): 
tb_writer.add_scalar("eval_{}".format(key), value, global_step) tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar( "loss", (tr_loss - logging_loss) / args.logging_steps, global_step, ) logging_loss = tr_loss # Save model checkpoint if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) # Take care of distributed/parallel training model_to_save = model.module if hasattr(model, "module") else model if args.train_adapter: model_to_save.save_all_adapters(output_dir) else: model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
def train(args, train_dataset, model, tokenizer): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) # kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {} train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, num_workers=1, pin_memory=True) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs if args.cuda: # Move model to GPU. model.cuda() # Prepare optimizer and schedule (linear warmup and decay) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate * hvd.size(), eps=args.adam_epsilon) # Horovod: broadcast parameters & optimizer state. hvd.broadcast_parameters(model.state_dict(), root_rank=0) hvd.broadcast_optimizer_state(optimizer, root_rank=0) # Horovod: (optional) compression algorithm. compression = hvd.Compression.none # Horovod: wrap optimizer with DistributedOptimizer. optimizer = hvd.DistributedOptimizer( optimizer, named_parameters=model.named_parameters(), compression=compression) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) print("############# Start to create model ###################") # multi-gpu training (should be after apex fp16 initialization) # if args.n_gpu > 1: # print("############# DataParallel ###################") # model = torch.nn.DataParallel(model) # # Distributed training (should be after apex fp16 initialization) # if args.local_rank != -1: # print("############# DistributedDataParallel ###################") # model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], # output_device=args.local_rank, # find_unused_parameters=True) # else: # print("############# Normal ###################") # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (hvd.size() if args.local_rank != -1 else 1)) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) set_seed( args) # Added here for reproducibility (even between python 2 and 3) for _ in range(int(args.num_train_epochs)): epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) step_count = 0 aa = time.time() for step, batch in enumerate(train_dataloader): a = time.time() inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch) inputs, labels = inputs.cuda(), labels.cuda() # inputs = inputs.to(args.device) # labels = labels.to(args.device) model.train() outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model( inputs, labels=labels) loss = outputs[ 0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.local_rank in [ -1, 0 ] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): tb_writer.add_scalar('eval_{}'.format(key), value, global_step) tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.local_rank in [ -1, 0 ] and args.save_steps > 0 and global_step % args.save_steps == 0: checkpoint_prefix = 'checkpoint' # Save model checkpoint output_dir = os.path.join( args.output_dir, '{}-{}'.format(checkpoint_prefix, global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = model.module if hasattr( model, 'module' ) else model # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, 'training_args.bin')) logger.info("Saving model checkpoint to %s", output_dir) _rotate_checkpoints(args, checkpoint_prefix) step_count += 1 if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break b = time.time() if step_count % 100 == 20: print("***** Training time: ", b - a, "; Step: ", step_count, "; loss: ", tr_loss / step_count, "*****") bb = time.time() print("***** Total Training time: ", bb - aa, "; Step: ", step_count, "; loss: ", tr_loss / step_count, "*****") if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]: """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) def collate(examples: List[torch.Tensor]): if tokenizer._pad_token is None: return pad_sequence(examples, batch_first=True) return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id) train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) train_dataloader = DataLoader( train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate ) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay, }, {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total ) # Check if saved optimizer or scheduler states exist if ( args.model_name_or_path and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt")) ): # Load in optimizer and scheduler states optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if args.model_name_or_path and os.path.exists(args.model_name_or_path): try: # set global_step to gobal_step of last saved checkpoint from model path checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0] global_step = int(checkpoint_suffix) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps) logger.info(" Continuing training from checkpoint, will skip to saved global_step") logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) except ValueError: logger.info(" Starting fine-tuning.") tr_loss, logging_loss = 0.0, 0.0 model_to_resize = model.module if hasattr(model, "module") else model # Take care of distributed/parallel training model_to_resize.resize_token_embeddings(len(tokenizer)) model.zero_grad() train_iterator = trange( epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0] ) set_seed(args) # Added here for reproducibility for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch) inputs = inputs.to(args.device) labels = labels.to(args.device) model.train() outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels) loss = outputs[0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 print(f"Step: {step}, Loss: {loss.item()}") if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics if ( args.local_rank == -1 and args.evaluate_during_training ): # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss 
if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: checkpoint_prefix = "checkpoint" # Save model checkpoint output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step)) os.makedirs(output_dir, exist_ok=True) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) _rotate_checkpoints(args, checkpoint_prefix) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
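The language-model train() above pads each batch on the fly with a collate closure built on pad_sequence. A self-contained sketch, assuming a pad token id of 0 where the real code reads it from the tokenizer:

# Self-contained sketch of the dynamic-padding collate used above, assuming
# pad_token_id = 0 (the original takes it from the tokenizer).
from typing import List
import torch
from torch.nn.utils.rnn import pad_sequence

def collate(examples: List[torch.Tensor], pad_token_id: int = 0) -> torch.Tensor:
    # Pads every sequence in the batch to the length of the longest one.
    return pad_sequence(examples, batch_first=True, padding_value=pad_token_id)

batch = [torch.tensor([5, 6, 7]), torch.tensor([8, 9]), torch.tensor([10])]
print(collate(batch))
# tensor([[ 5,  6,  7],
#         [ 8,  9,  0],
#         [10,  0,  0]])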
if args.local_rank in [-1, 0]: tb_writer = SummaryWriter(args.exp_name) encoder.zero_grad() model.zero_grad() ###++++++++++++++++++++++++++++++++++++++++++ total_batch_num = len(train_dataloader) logger.info('Total number of batches = {}'.format(total_batch_num)) eval_batch_interval_num = int(total_batch_num * args.eval_interval_ratio) + 1 logger.info( 'Evaluate the model by = {} batches'.format(eval_batch_interval_num)) ###++++++++++++++++++++++++++++++++++++++++++ train_iterator = trange(start_epoch, start_epoch + int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) for epoch in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): encoder.train() model.train() #++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ for key, value in batch.items(): if key not in {'ids'}: batch[key] = value.to(args.device) #++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ inputs = { 'input_ids':
mnist.data[:60000] = mnist.data[reorder_train] mnist.target[:60000] = mnist.target[reorder_train] mnist.data[60000:] = mnist.data[reorder_test + 60000] mnist.target[60000:] = mnist.target[reorder_test + 60000] # Get MNIST data, normalize, and divide by level # mnist = fetch_openml('MNIST original', data_home='./data') mnist = fetch_openml('mnist_784', version=1, cache=True) mnist.target = mnist.target.astype(np.int8) # fetch_openml() returns targets as strings sort_by_target(mnist) # fetch_openml() returns an unsorted dataset mu = np.mean(mnist.data.astype(np.float32), 0) sigma = np.std(mnist.data.astype(np.float32), 0) mnist.data = (mnist.data.astype(np.float32) - mu)/(sigma+0.001) mnist_data = [] for i in trange(10): idx = mnist.target==i mnist_data.append(mnist.data[idx]) print([len(v) for v in mnist_data]) ###### CREATE USER DATA SPLIT ####### # Assign 10 samples to each user X = [[] for _ in range(1000)] y = [[] for _ in range(1000)] idx = np.zeros(10, dtype=np.int64) for user in range(1000): for j in range(2): l = (user+j)%10 X[user] += mnist_data[l][idx[l]:idx[l]+5].tolist() y[user] += (l*np.ones(5)).tolist()
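The split above hands each of 1000 users ten samples drawn from two neighbouring digit classes. A compact sketch of the same assignment over synthetic per-class pools (the shapes, counts, and the explicit index advance are illustrative, not taken from the original script):

# Compact sketch of the two-classes-per-user split above, run on synthetic
# per-class pools instead of MNIST (illustrative sizes).
import numpy as np

num_classes, num_users, per_class = 10, 1000, 5
pools = [np.random.randn(1000, 784) for _ in range(num_classes)]  # stand-ins for mnist_data
idx = np.zeros(num_classes, dtype=np.int64)
X = [[] for _ in range(num_users)]
y = [[] for _ in range(num_users)]

for user in range(num_users):
    for j in range(2):                       # two classes per user
        l = (user + j) % num_classes
        X[user] += pools[l][idx[l]:idx[l] + per_class].tolist()
        y[user] += (l * np.ones(per_class)).tolist()
        idx[l] += per_class                  # advance so samples are not reused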
def train(args, train_dataset, model, tokenizer, criterion): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) train_dataloader = DataLoader( train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate_fn, num_workers=args.num_workers, ) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay, }, {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total ) if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 best_f1, n_no_improve = 0, 0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) set_seed(args) # Added here for reproductibility for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): model.train() batch = tuple(t.to(args.device) for t in batch) labels = batch[5] inputs = { "input_ids": batch[0], "input_modal": batch[2], "attention_mask": batch[1], "modal_start_tokens": batch[3], "modal_end_tokens": batch[4], } outputs = model(**inputs) logits = outputs[0] # model outputs are always tuple in transformers (see doc) loss = criterion(logits, labels) if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: logs = {} if ( args.local_rank == -1 and args.evaluate_during_training ): # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer, criterion) for key, value in results.items(): eval_key = "eval_{}".format(key) logs[eval_key] = value loss_scalar = (tr_loss - logging_loss) / args.logging_steps learning_rate_scalar = scheduler.get_lr()[0] logs["learning_rate"] = learning_rate_scalar logs["loss"] = loss_scalar logging_loss = tr_loss for key, value in logs.items(): tb_writer.add_scalar(key, value, global_step) print(json.dumps({**logs, **{"step": global_step}})) if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training torch.save(model_to_save.state_dict(), os.path.join(output_dir, WEIGHTS_NAME)) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank == -1: results = evaluate(args, model, tokenizer, criterion) if results["micro_f1"] > best_f1: best_f1 = results["micro_f1"] n_no_improve = 0 else: n_no_improve += 1 if n_no_improve > args.patience: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
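The multimodal train() above stops once micro-F1 has failed to improve for more than args.patience epochs. A small sketch of that patience rule, with a hard-coded score list standing in for evaluate():

# Small sketch of the patience-based early stopping used above; the scores
# list stands in for evaluate()'s micro_f1 across epochs.
def should_stop(scores, patience):
    best_f1, n_no_improve = 0.0, 0
    for epoch, f1 in enumerate(scores):
        if f1 > best_f1:
            best_f1, n_no_improve = f1, 0
        else:
            n_no_improve += 1
        if n_no_improve > patience:
            return epoch   # stop after this epoch
    return None

print(should_stop([0.61, 0.64, 0.63, 0.62, 0.62], patience=2))   # 4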
def main(params): model_output_path = params["output_path"] if not os.path.exists(model_output_path): os.makedirs(model_output_path) logger = utils.get_logger(params["output_path"]) # Init model reranker = CrossEncoderRanker(params) tokenizer = reranker.tokenizer model = reranker.model # utils.save_model(model, tokenizer, model_output_path) device = reranker.device n_gpu = reranker.n_gpu if params["gradient_accumulation_steps"] < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(params["gradient_accumulation_steps"])) # An effective batch size of `x`, when we are accumulating the gradient accross `y` batches will be achieved by having a batch size of `z = x / y` # args.gradient_accumulation_steps = args.gradient_accumulation_steps // n_gpu params["train_batch_size"] = (params["train_batch_size"] // params["gradient_accumulation_steps"]) train_batch_size = params["train_batch_size"] eval_batch_size = params["eval_batch_size"] grad_acc_steps = params["gradient_accumulation_steps"] # Fix the random seeds seed = params["seed"] random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) if reranker.n_gpu > 0: torch.cuda.manual_seed_all(seed) max_seq_length = params["max_seq_length"] context_length = params["max_context_length"] fname = os.path.join(params["data_path"], "train.t7") train_data = torch.load(fname) context_input = train_data["context_vecs"] candidate_input = train_data["cand_vecs"] label_input = train_data["labels"] if params["debug"]: max_n = 200 context_input = context_input[:max_n] candidate_input = candidate_input[:max_n] label_input = label_input[:max_n] context_input = modify(context_input, candidate_input, max_seq_length) train_tensor_data = TensorDataset(context_input, label_input) train_sampler = RandomSampler(train_tensor_data) train_dataloader = DataLoader(train_tensor_data, sampler=train_sampler, batch_size=params["train_batch_size"]) max_n = 2048 if params["debug"]: max_n = 200 fname = os.path.join(params["data_path"], "valid.t7") valid_data = torch.load(fname) context_input = valid_data["context_vecs"][:max_n] candidate_input = valid_data["cand_vecs"][:max_n] label_input = valid_data["labels"][:max_n] context_input = modify(context_input, candidate_input, max_seq_length) valid_tensor_data = TensorDataset(context_input, label_input) valid_sampler = SequentialSampler(valid_tensor_data) valid_dataloader = DataLoader(valid_tensor_data, sampler=valid_sampler, batch_size=params["eval_batch_size"]) # evaluate before training results = evaluate( reranker, valid_dataloader, device=device, logger=logger, context_length=context_length, silent=params["silent"], ) number_of_samples_per_dataset = {} time_start = time.time() utils.write_to_file(os.path.join(model_output_path, "training_params.txt"), str(params)) logger.info("Starting training") logger.info("device: {} n_gpu: {}, distributed training: {}".format( device, n_gpu, False)) optimizer = get_optimizer(model, params) scheduler = get_scheduler(params, optimizer, len(train_tensor_data), logger) model.train() best_epoch_idx = -1 best_score = -1 num_train_epochs = params["num_train_epochs"] for epoch_idx in trange(int(num_train_epochs), desc="Epoch"): tr_loss = 0 results = None if params["silent"]: iter_ = train_dataloader else: iter_ = tqdm(train_dataloader, desc="Batch") part = 0 for step, batch in enumerate(iter_): batch = tuple(t.to(device) for t in batch) context_input, label_input = batch loss, _ = reranker(context_input, label_input, context_length) # if n_gpu > 1: # loss 
= loss.mean() # mean() to average on multi-gpu. if grad_acc_steps > 1: loss = loss / grad_acc_steps tr_loss += loss.item() if (step + 1) % (params["print_interval"] * grad_acc_steps) == 0: logger.info("Step {} - epoch {} average loss: {}\n".format( step, epoch_idx, tr_loss / (params["print_interval"] * grad_acc_steps), )) tr_loss = 0 loss.backward() if (step + 1) % grad_acc_steps == 0: torch.nn.utils.clip_grad_norm_(model.parameters(), params["max_grad_norm"]) optimizer.step() scheduler.step() optimizer.zero_grad() if (step + 1) % (params["eval_interval"] * grad_acc_steps) == 0: logger.info("Evaluation on the development dataset") evaluate( reranker, valid_dataloader, device=device, logger=logger, context_length=context_length, silent=params["silent"], ) logger.info("***** Saving fine - tuned model *****") epoch_output_folder_path = os.path.join( model_output_path, "epoch_{}_{}".format(epoch_idx, part)) part += 1 utils.save_model(model, tokenizer, epoch_output_folder_path) model.train() logger.info("\n") logger.info("***** Saving fine - tuned model *****") epoch_output_folder_path = os.path.join(model_output_path, "epoch_{}".format(epoch_idx)) utils.save_model(model, tokenizer, epoch_output_folder_path) # reranker.save(epoch_output_folder_path) output_eval_file = os.path.join(epoch_output_folder_path, "eval_results.txt") results = evaluate( reranker, valid_dataloader, device=device, logger=logger, context_length=context_length, silent=params["silent"], ) ls = [best_score, results["normalized_accuracy"]] li = [best_epoch_idx, epoch_idx] best_score = ls[np.argmax(ls)] best_epoch_idx = li[np.argmax(ls)] logger.info("\n") execution_time = (time.time() - time_start) / 60 utils.write_to_file( os.path.join(model_output_path, "training_time.txt"), "The training took {} minutes\n".format(execution_time), ) logger.info("The training took {} minutes\n".format(execution_time)) # save the best model in the parent_dir logger.info("Best performance in epoch: {}".format(best_epoch_idx)) params["path_to_model"] = os.path.join(model_output_path, "epoch_{}".format(best_epoch_idx))
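A minimal, self-contained sketch of the gradient-accumulation pattern used in the training loop above, with a toy model and random data standing in for the cross-encoder reranker and its dataloaders (all names below are placeholders, not part of the original script):

# Illustrative sketch only: toy model and data, not the original reranker.
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from tqdm import trange, tqdm

model = nn.Linear(8, 2)                      # placeholder for the cross-encoder
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
loss_fn = nn.CrossEntropyLoss()
data = TensorDataset(torch.randn(64, 8), torch.randint(0, 2, (64,)))
loader = DataLoader(data, batch_size=4, shuffle=True)
grad_acc_steps = 4                           # effective batch size = 4 * 4

for epoch in trange(2, desc="Epoch"):
    for step, (x, y) in enumerate(tqdm(loader, desc="Batch", leave=False)):
        loss = loss_fn(model(x), y) / grad_acc_steps   # normalize before backward
        loss.backward()                                # gradients accumulate
        if (step + 1) % grad_acc_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            optimizer.zero_grad()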
import numpy as np
import matplotlib.pyplot as plt
from skimage import io  # io.imread replaces the deprecated skimage.data.imread

# Get all file list
from os import listdir

directory = "realpages/"
file_list = listdir(directory)
save_directory = "croppages/"

# Loop over all files
for i in trange(len(file_list)):
    # for i in range(0, 10):
    # Read data
    image_file = directory + file_list[i]
    im = io.imread(image_file)
    # plt.figure(figsize=(30, 20))
    # plt.imshow(im, cmap='gray')

    # Get shape
    shape = im.shape
    # print(shape)

    # Crop a fixed border from each page scan
    crop_im = im[17:shape[0], 10:shape[1] - 17]
    # plt.figure(figsize=(30, 20))
    # plt.imshow(crop_im, cmap='gray')
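The fragment above stops before the cropped page is written anywhere. A small hedged sketch of the same crop-and-save pattern with an output step added; the directory names reuse the placeholders above and the crop margins are only illustrative:

# Sketch assuming the same realpages/ -> croppages/ layout; not the original script's save logic.
import os
from skimage import io
from tqdm import trange

directory = "realpages/"
save_directory = "croppages/"
os.makedirs(save_directory, exist_ok=True)
file_list = os.listdir(directory)

for i in trange(len(file_list)):
    im = io.imread(os.path.join(directory, file_list[i]))
    h, w = im.shape[:2]
    crop_im = im[17:h, 10:w - 17]          # trim the scanner border
    io.imsave(os.path.join(save_directory, file_list[i]), crop_im)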
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument("--src_file", default=None, type=str, help="The input data file name.") parser.add_argument("--tgt_file", default=None, type=str, help="The output data file name.") parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument("--config_path", default=None, type=str, help="Bert config file path.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument( "--log_dir", default='', type=str, required=True, help="The output directory where the log will be written.") parser.add_argument("--model_recover_path", default=None, type=str, required=True, help="The file of fine-tuned pretraining model.") parser.add_argument("--optim_recover_path", default=None, type=str, help="The file of pretraining optimizer.") # Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=64, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--label_smoothing", default=0, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.01, type=float, help="The weight decay rate for Adam.") parser.add_argument("--finetune_decay", action='store_true', help="Weight decay to the original weights.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--hidden_dropout_prob", default=0.1, type=float, help="Dropout rate for hidden states.") parser.add_argument("--attention_probs_dropout_prob", default=0.1, type=float, help="Dropout rate for attention probabilities.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." 
) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--fp32_embedding', action='store_true', help= "Whether to use 32-bit float precision instead of 16-bit for embeddings" ) parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--amp', action='store_true', help="Whether to use amp for fp16") parser.add_argument( '--from_scratch', action='store_true', help= "Initialize parameters with random values (i.e., training from scratch)." ) parser.add_argument('--new_segment_ids', action='store_true', help="Use new segment ids for bi-uni-directional LM.") parser.add_argument('--new_pos_ids', action='store_true', help="Use new position ids for LMs.") parser.add_argument('--tokenized_input', action='store_true', help="Whether the input is tokenized.") parser.add_argument('--max_len_a', type=int, default=0, help="Truncate_config: maximum length of segment A.") parser.add_argument('--max_len_b', type=int, default=0, help="Truncate_config: maximum length of segment B.") parser.add_argument( '--trunc_seg', default='', help="Truncate_config: first truncate segment A/B (option: a, b).") parser.add_argument( '--always_truncate_tail', action='store_true', help="Truncate_config: Whether we should always truncate tail.") parser.add_argument( "--mask_prob", default=0.15, type=float, help= "Number of prediction is sometimes less than max_pred when sequence is short." ) parser.add_argument( "--mask_prob_eos", default=0, type=float, help= "Number of prediction is sometimes less than max_pred when sequence is short." ) parser.add_argument('--max_pred', type=int, default=20, help="Max tokens of prediction.") parser.add_argument("--num_workers", default=0, type=int, help="Number of workers for the data loader.") parser.add_argument('--mask_source_words', action='store_true', help="Whether to mask source words for training") parser.add_argument('--skipgram_prb', type=float, default=0.0, help='prob of ngram mask') parser.add_argument('--skipgram_size', type=int, default=1, help='the max size of ngram mask') parser.add_argument('--mask_whole_word', action='store_true', help="Whether masking a whole word.") parser.add_argument('--do_l2r_training', action='store_true', help="Whether to do left to right training") parser.add_argument( '--has_sentence_oracle', action='store_true', help="Whether to have sentence level oracle for training. 
" "Only useful for summary generation") parser.add_argument('--max_position_embeddings', type=int, default=None, help="max position embeddings") parser.add_argument('--relax_projection', action='store_true', help="Use different projection layers for tasks.") parser.add_argument('--ffn_type', default=0, type=int, help="0: default mlp; 1: W((Wx+b) elem_prod x);") parser.add_argument('--num_qkv', default=0, type=int, help="Number of different <Q,K,V>.") parser.add_argument('--seg_emb', action='store_true', help="Using segment embedding for self-attention.") parser.add_argument( '--s2s_special_token', action='store_true', help="New special tokens ([S2S_SEP]/[S2S_CLS]) of S2S.") parser.add_argument('--s2s_add_segment', action='store_true', help="Additional segmental for the encoder of S2S.") parser.add_argument( '--s2s_share_segment', action='store_true', help= "Sharing segment embeddings for the encoder of S2S (used with --s2s_add_segment)." ) parser.add_argument('--pos_shift', action='store_true', help="Using position shift for fine-tuning.") parser.add_argument( "--experiment", type=str, default="full", help= "1.full (title + full abstract) 2.title (only title), 3.title-l1 (title + l1), 4. single 5. segsep" ) args = parser.parse_args() assert Path( args.model_recover_path).exists(), "--model_recover_path doesn't exist" args.output_dir = args.output_dir.replace('[PT_OUTPUT_DIR]', os.getenv('PT_OUTPUT_DIR', '')) args.log_dir = args.log_dir.replace('[PT_OUTPUT_DIR]', os.getenv('PT_OUTPUT_DIR', '')) os.makedirs(args.output_dir, exist_ok=True) os.makedirs(args.log_dir, exist_ok=True) json.dump(args.__dict__, open(os.path.join(args.output_dir, 'opt.json'), 'w'), sort_keys=True, indent=2) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs dist.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if args.local_rank not in (-1, 0): # Make sure only the first process in distributed training will download model & vocab dist.barrier() tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) if args.max_position_embeddings: tokenizer.max_len = args.max_position_embeddings data_tokenizer = WhitespaceTokenizer( ) if args.tokenized_input else tokenizer if args.local_rank == 0: dist.barrier() DatasetFunc = ConcatDataset processor = seq2seq_loader.Preprocess4Seq2seq if args.experiment == "title": DatasetFunc = TitleDataset elif args.experiment == "title-l1": DatasetFunc = TitleLead1Dataset elif args.experiment == "single": DatasetFunc = SingleTrainingDataset elif args.experiment == "title-first": DatasetFunc = TitleFirstDataset elif args.experiment == "segsep": DatasetFunc = SegSepDataset processor = Preprocess4SegSep if args.do_train: print("Loading Train Dataset", args.data_dir) bi_uni_pipeline = [ processor(args.max_pred, args.mask_prob, list(tokenizer.vocab.keys()), tokenizer.convert_tokens_to_ids, args.max_seq_length, new_segment_ids=args.new_segment_ids, truncate_config={ 'max_len_a': args.max_len_a, 'max_len_b': args.max_len_b, 'trunc_seg': args.trunc_seg, 'always_truncate_tail': args.always_truncate_tail }, mask_source_words=args.mask_source_words, skipgram_prb=args.skipgram_prb, skipgram_size=args.skipgram_size, mask_whole_word=args.mask_whole_word, mode="s2s", has_oracle=args.has_sentence_oracle, num_qkv=args.num_qkv, s2s_special_token=args.s2s_special_token, s2s_add_segment=args.s2s_add_segment, s2s_share_segment=args.s2s_share_segment, pos_shift=args.pos_shift) ] file_oracle = None if args.has_sentence_oracle: file_oracle = os.path.join(args.data_dir, 'train.oracle') fn_src = os.path.join(args.data_dir, args.src_file if args.src_file else 'train.src') fn_tgt = os.path.join(args.data_dir, args.tgt_file if args.tgt_file else 'train.tgt') train_dataset = DatasetFunc(fn_src, fn_tgt, args.train_batch_size, data_tokenizer, args.max_seq_length, args.max_len_b, bi_uni_pipeline=bi_uni_pipeline) if args.local_rank == -1: train_sampler = RandomSampler(train_dataset, replacement=False) _batch_size = args.train_batch_size else: train_sampler = DistributedSampler(train_dataset) _batch_size = args.train_batch_size // dist.get_world_size() train_dataloader = torch.utils.data.DataLoader( train_dataset, batch_size=_batch_size, sampler=train_sampler, num_workers=args.num_workers, collate_fn=seq2seq_loader.batch_list_to_batch_tensors, pin_memory=False) # note: args.train_batch_size has been changed to (/= args.gradient_accumulation_steps) # t_total = int(math.ceil(len(train_dataset.ex_list) / args.train_batch_size) t_total = int( len(train_dataloader) * args.num_train_epochs / args.gradient_accumulation_steps) amp_handle = None if args.fp16 and args.amp: from apex import amp amp_handle = amp.init(enable_caching=True) logger.info("enable fp16 with amp") # Prepare model recover_step = 
_get_max_epoch_model(args.output_dir) cls_num_labels = 2 type_vocab_size = 6 + \ (1 if args.s2s_add_segment else 0) if args.new_segment_ids else 2 if args.experiment == "segsep": type_vocab_size = 11 # for the largest dataset only have 10 papers num_sentlvl_labels = 2 if args.has_sentence_oracle else 0 relax_projection = 4 if args.relax_projection else 0 if args.local_rank not in (-1, 0): # Make sure only the first process in distributed training will download model & vocab dist.barrier() if (recover_step is None) and (args.model_recover_path is None): # if _state_dict == {}, the parameters are randomly initialized # if _state_dict == None, the parameters are initialized with bert-init _state_dict = {} if args.from_scratch else None model = BertForPreTrainingLossMask.from_pretrained( args.bert_model, state_dict=_state_dict, num_labels=cls_num_labels, num_rel=0, type_vocab_size=type_vocab_size, config_path=args.config_path, task_idx=3, num_sentlvl_labels=num_sentlvl_labels, max_position_embeddings=args.max_position_embeddings, label_smoothing=args.label_smoothing, fp32_embedding=args.fp32_embedding, relax_projection=relax_projection, new_pos_ids=args.new_pos_ids, ffn_type=args.ffn_type, hidden_dropout_prob=args.hidden_dropout_prob, attention_probs_dropout_prob=args.attention_probs_dropout_prob, num_qkv=args.num_qkv, seg_emb=args.seg_emb) global_step = 0 else: if recover_step: logger.info("***** Recover model: %d *****", recover_step) model_recover = torch.load(os.path.join( args.output_dir, "model.{0}.bin".format(recover_step)), map_location='cpu') # recover_step == number of epochs global_step = math.floor(recover_step * t_total / args.num_train_epochs) elif args.model_recover_path: logger.info("***** Recover model: %s *****", args.model_recover_path) model_recover = torch.load(args.model_recover_path, map_location='cpu') global_step = 0 model = BertForPreTrainingLossMask.from_pretrained( args.bert_model, state_dict=model_recover, num_labels=cls_num_labels, num_rel=0, type_vocab_size=type_vocab_size, config_path=args.config_path, task_idx=3, num_sentlvl_labels=num_sentlvl_labels, max_position_embeddings=args.max_position_embeddings, label_smoothing=args.label_smoothing, fp32_embedding=args.fp32_embedding, relax_projection=relax_projection, new_pos_ids=args.new_pos_ids, ffn_type=args.ffn_type, hidden_dropout_prob=args.hidden_dropout_prob, attention_probs_dropout_prob=args.attention_probs_dropout_prob, num_qkv=args.num_qkv, seg_emb=args.seg_emb) if args.local_rank == 0: dist.barrier() if args.fp16: model.half() if args.fp32_embedding: model.bert.embeddings.word_embeddings.float() model.bert.embeddings.position_embeddings.float() model.bert.embeddings.token_type_embeddings.float() model.to(device) if args.local_rank != -1: try: from torch.nn.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError("DistributedDataParallel") model = DDP(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) elif n_gpu > 1: # model = torch.nn.DataParallel(model) model = DataParallelImbalance(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: # from apex.optimizers import FP16_Optimizer from 
pytorch_pretrained_bert.optimization_fp16 import FP16_Optimizer_State from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer_State(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer_State(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) if recover_step: logger.info("***** Recover optimizer: %d *****", recover_step) optim_recover = torch.load(os.path.join( args.output_dir, "optim.{0}.bin".format(recover_step)), map_location='cpu') if hasattr(optim_recover, 'state_dict'): optim_recover = optim_recover.state_dict() optimizer.load_state_dict(optim_recover) if args.loss_scale == 0: logger.info("***** Recover optimizer: dynamic_loss_scale *****") optimizer.dynamic_loss_scale = True logger.info("***** CUDA.empty_cache() *****") torch.cuda.empty_cache() if args.do_train: logger.info("***** Running training *****") logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", t_total) model.train() if recover_step: start_epoch = recover_step + 1 else: start_epoch = 1 for i_epoch in trange(start_epoch, int(args.num_train_epochs) + 1, desc="Epoch", disable=args.local_rank not in (-1, 0)): if args.local_rank != -1: train_sampler.set_epoch(i_epoch) iter_bar = tqdm(train_dataloader, desc='Iter (loss=X.XXX)', disable=args.local_rank not in (-1, 0)) for step, batch in enumerate(iter_bar): batch = [ t.to(device) if t is not None else None for t in batch ] if args.has_sentence_oracle: input_ids, segment_ids, input_mask, mask_qkv, lm_label_ids, masked_pos, masked_weights, is_next, task_idx, oracle_pos, oracle_weights, oracle_labels = batch else: input_ids, segment_ids, input_mask, mask_qkv, lm_label_ids, masked_pos, masked_weights, is_next, task_idx = batch oracle_pos, oracle_weights, oracle_labels = None, None, None loss_tuple = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next, masked_pos=masked_pos, masked_weights=masked_weights, task_idx=task_idx, masked_pos_2=oracle_pos, masked_weights_2=oracle_weights, masked_labels_2=oracle_labels, mask_qkv=mask_qkv) masked_lm_loss, next_sentence_loss = loss_tuple if n_gpu > 1: # mean() to average on multi-gpu. 
# loss = loss.mean() masked_lm_loss = masked_lm_loss.mean() next_sentence_loss = next_sentence_loss.mean() loss = masked_lm_loss + next_sentence_loss # logging for each step (i.e., before normalization by args.gradient_accumulation_steps) iter_bar.set_description('Iter (loss=%5.3f)' % loss.item()) # ensure that accumlated gradients are normalized if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) if amp_handle: amp_handle._clear_cache() else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: lr_this_step = args.learning_rate * \ warmup_linear(global_step/t_total, args.warmup_proportion) if args.fp16: # modify learning rate with special warm up BERT uses for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model if (args.local_rank == -1 or torch.distributed.get_rank() == 0): logger.info( "** ** * Saving fine-tuned model and optimizer ** ** * ") model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join( args.output_dir, "model.{0}.bin".format(i_epoch)) torch.save(model_to_save.state_dict(), output_model_file) output_optim_file = os.path.join( args.output_dir, "optim.{0}.bin".format(i_epoch)) torch.save(optimizer.state_dict(), output_optim_file) logger.info("***** CUDA.empty_cache() *****") torch.cuda.empty_cache()
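The epoch and iteration progress bars in the script above follow a common distributed-training idiom: only rank 0 (or a non-distributed run) renders the bar, and the bar description is updated with the current loss. A minimal sketch of just that idiom with a dummy inner loop; local_rank here is a stand-in for args.local_rank:

# Sketch of rank-aware progress bars; no real model or data involved.
from tqdm import tqdm, trange

local_rank = -1          # -1 means "not distributed" in this convention
num_train_epochs = 3

for i_epoch in trange(1, num_train_epochs + 1, desc="Epoch",
                      disable=local_rank not in (-1, 0)):
    iter_bar = tqdm(range(100), desc="Iter (loss=X.XXX)",
                    disable=local_rank not in (-1, 0))
    for step, _ in enumerate(iter_bar):
        loss = 1.0 / (step + 1)                     # placeholder loss value
        iter_bar.set_description("Iter (loss=%5.3f)" % loss)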
inp1 = torch.randn(num_workers, 3, resolution[0], resolution[1]).cuda()
depth1 = torch.randn(num_workers, 1, resolution[0], resolution[1]).cuda()
a_icm = torch.zeros(num_workers).long().cuda()

for epoch in range(epochs):
    print("\nEpoch %d\n-------" % (epoch))
    loss_value_total = 0.0
    loss_policy_total = 0.0
    loss_entropy_total = 0.0
    loss_inverse_total = 0.0
    loss_forward_total = 0.0
    reward_intrinsic_total = 0.0

    print("Training...")
    model.train()
    for learning_step in trange(sequences_per_epoch, leave=False):
        loss = 0.0
        probs_list = []
        log_probs_list = []
        entropy_list = []
        value_list = []
        reward_list = []
        unfinished_list = []
        forward_start_time = time()
        for t in range(seq_len):
            inp, depth = prep_frames_batch(workers)
            (policy, value, hidden) = model(inp, hidden)
            probs = F.softmax(policy, 1)
            log_probs = F.log_softmax(policy, 1)
            a = probs.multinomial(num_samples=1).detach().squeeze(1)
            probs_list.append(probs[whole_batch, a])
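A compact sketch of the same rollout-collection shape (outer epoch loop, trange over learning steps with leave=False, inner fixed-length sequence), using random logits instead of the real policy network; every name below is a placeholder:

# Sketch only: random "policy", no ICM module, no workers.
import torch
import torch.nn.functional as F
from tqdm import trange

num_actions, seq_len, sequences_per_epoch = 4, 8, 10

for epoch in range(2):
    for learning_step in trange(sequences_per_epoch, leave=False):
        probs_list, reward_list = [], []
        for t in range(seq_len):
            logits = torch.randn(1, num_actions)           # stand-in for model output
            probs = F.softmax(logits, dim=1)
            a = probs.multinomial(num_samples=1).detach().squeeze(1)
            probs_list.append(probs[torch.arange(1), a])
            reward_list.append(torch.rand(1))              # stand-in for env reward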
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data,
                                   sampler=validation_sampler,
                                   batch_size=batch_size)

device = torch.device("cuda:0")  # if torch.cuda.is_available() else "cpu")
train_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 4

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):
    # Training
    # Set our model to training mode (as opposed to evaluation mode)
    model.train()

    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    # Train the data for one epoch
    for step, batch in enumerate(train_dataloader):
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
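A stripped-down sketch of the epoch loop above with the tr_loss bookkeeping completed, run on dummy tensors; the model and data are placeholders for the BERT classifier and its dataloader, not the original objects:

# Sketch: dummy classifier and dummy batches; mirrors the tracking variables above.
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from tqdm import trange

model = nn.Linear(16, 2)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()
train_dataloader = DataLoader(
    TensorDataset(torch.randn(32, 16), torch.randint(0, 2, (32,))), batch_size=8)

epochs = 4
train_loss_set = []
for _ in trange(epochs, desc="Epoch"):
    model.train()
    tr_loss, nb_tr_examples, nb_tr_steps = 0, 0, 0
    for step, (inputs, labels) in enumerate(train_dataloader):
        optimizer.zero_grad()
        loss = loss_fn(model(inputs), labels)
        train_loss_set.append(loss.item())
        loss.backward()
        optimizer.step()
        tr_loss += loss.item()
        nb_tr_examples += inputs.size(0)
        nb_tr_steps += 1
    print("Train loss: {}".format(tr_loss / nb_tr_steps))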
# target_image = target_image.permute(2, 0, 1).unsqueeze(0)

optimizer = torch.optim.SGD(network.parameters(), lr=train_rate, momentum=0.5)
# optimizer = torch.optim.Adam(network.parameters(), lr=train_rate)

epochs = 100
validate_every = 10

if args.inference:
    torch.autograd.set_grad_enabled(False)
    epochs = 1

sim_duration = 25
sim_dt = (1.0 / 60.0) / sim_substeps
sim_steps = int(sim_duration / sim_dt)

for e in trange(epochs):
    sim_time = 0.0
    state = model.state()

    loss = torch.zeros(1, requires_grad=True)
    # loss = None

    print_every = 60 * 16
    render_every = 60
    imgs = []

    for i in range(0, sim_steps):
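The block above optimizes network parameters through a differentiable simulation. As a hedged stand-in, here is the same outer structure (trange over epochs, a loss accumulated across simulation steps, one optimizer step per epoch) on a trivial analytic "simulation":

# Sketch: a scalar parameter and a fake per-step residual instead of the real simulator.
import torch
from tqdm import trange

param = torch.nn.Parameter(torch.tensor(5.0))
optimizer = torch.optim.SGD([param], lr=0.05, momentum=0.5)

epochs, sim_steps = 100, 60
for e in trange(epochs):
    optimizer.zero_grad()
    loss = torch.zeros(1)
    for i in range(sim_steps):
        loss = loss + (param - 1.0) ** 2 / sim_steps   # pretend per-step error
    loss.backward()
    optimizer.step()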
def start(worker, size, quality, folder, temp, preffix, suffix, upscale, downscale, copy_ud, cores):
    curr = ""
    d = open(f"{temp}/core{worker}.log", "w")
    d.close()
    d = open(f"{temp}/core{worker}.log", "a")
    try:
        images = get_data(worker, temp)
        bar = trange(
            len(images), leave=True, dynamic_ncols=True,
            ascii=True)  # , bar_format="{l_bar}{bar}|{n_fmt}/{total_fmt}"
        terminalsize = 100**10
        for x in bar:
            if os.path.isfile("./stop.all"):
                break
            try:
                curr = images[x][1]
                img = Image.open(f"{folder}{images[x][0]}{images[x][2]}")
                if not os.path.isdir(folder + images[x][1]):
                    os.makedirs(folder + images[x][1])
                if (((img.size[0] > size or img.size[1] > size) and downscale)
                        or ((img.size[0] < size and img.size[1] < size) and upscale)):
                    img.resize(
                        get_pos(img.size, size), resample=Image.BICUBIC
                    ).save(
                        f"{folder}{images[x][1]}{preffix}{images[x][3]}{suffix}{images[x][4]}",
                        quality=quality, optimize=True)
                elif copy_ud:
                    img.save(
                        f"{folder}{images[x][1]}{preffix}{images[x][3]}{suffix}{images[x][4]}",
                        quality=quality, optimize=True)
                img = None
                fa = open(f"{temp}/core{worker}.progress", "w")
                fa.write(str(x))
                fa.close()
                if os.get_terminal_size()[0] < terminalsize or bar.ncols + 1 > os.get_terminal_size()[0]:
                    terminalsize = os.get_terminal_size()[0]
                    bar.refresh()
                    os.system("cls")
                    if (cores == 1):
                        print("Starting..")
                        print("Applying...")
                        print("Ctrl+C to stop")
                elif os.get_terminal_size()[0] != terminalsize:
                    terminalsize = os.get_terminal_size()[0]
            except Exception as E:
                d.write("\n" + re.sub(
                    regex, subst,
                    f'{datetime.now().strftime("%d.%m.%Y %H:%M:%S")} [Error] > "{images[x][0]}{images[x][2]}" > {E}'
                ))
    except Exception as E:
        d = open(f"{temp}/core{worker}.log", "a")
        d.write(f"\n{datetime.now()} [Error] > {curr} > {E}")
        d.close()
        f = open(f"{temp}/core{worker}.data", "w+")
        f.close()
    d.close()
    f = open(f"{temp}/core{worker}.data", "w+")
    f.close()
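The worker above refreshes the bar by hand when the terminal is resized; tqdm's dynamic_ncols option already handles most of that. A small sketch of just the progress-bar options it relies on (trange with leave/dynamic_ncols/ascii, reporting through the bar instead of bare print calls); the sleep stands in for resizing one image:

# Sketch of the progress-bar options only; no image processing here.
import time
from tqdm import trange

bar = trange(20, leave=True, dynamic_ncols=True, ascii=True)
for x in bar:
    time.sleep(0.05)                       # stand-in for resizing one image
    bar.set_postfix(current=x)             # show the item being processed
    if x % 10 == 0:
        bar.write("checkpointed progress at item %d" % x)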
# -----------
# Main
#
# This is where we will mostly be spawning workers, initializing networks, and plotting our rewards.
# -----------
if __name__ == "__main__":
    memory = Memory([], MEMORY_SIZE)
    model = QNetwork()
    epsilon = 1
    all_epsilons, all_steps, all_rewards = [], [], []

    env = gym.make("CartPole-v0")
    video_recorder = VideoRecorder(env, './output/00_Cartpole_Q_Learning_Discrete_Video.mp4', enabled=True)

    for i in trange(TOTAL_RUNTIME):
        total_reward = 0.0
        total_steps = 0
        state = env.reset()[2]
        while True:
            if (i + 1) % 100 == 0:
                video_recorder.capture_frame()
            if i < STARTUP_SIZE:
                action = env.action_space.sample()
            else:
                if random.random() < epsilon:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(model(torch.from_numpy(angle_to_vector(state, N_STATES)).to(device)).cpu().detach().numpy())
            next_state, reward, done, _ = env.step(action)
            next_state = next_state[2]
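A self-contained sketch of the warm-up plus epsilon-greedy branching wrapped in trange, with the gym environment and Q-network replaced by random placeholders so the control flow is visible on its own; STARTUP_SIZE, the decay schedule, and the other constants below are illustrative values, not taken from the original:

# Sketch: no gym, no Q-network, just the warm-up / explore / exploit branching.
import random
import numpy as np
from tqdm import trange

TOTAL_RUNTIME, STARTUP_SIZE, N_ACTIONS = 500, 50, 2
epsilon, epsilon_min, epsilon_decay = 1.0, 0.05, 0.995
q_values = np.zeros(N_ACTIONS)                  # stand-in for the Q-network output

for i in trange(TOTAL_RUNTIME):
    if i < STARTUP_SIZE or random.random() < epsilon:
        action = random.randrange(N_ACTIONS)    # explore
    else:
        action = int(np.argmax(q_values))       # exploit
    epsilon = max(epsilon_min, epsilon * epsilon_decay)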
def train(self):
    x_list, xs, ys, sample_list = self.batch_manager.random_list(self.b_num)
    save_image(xs, '{}/x_gt.png'.format(self.model_dir))
    save_image(ys, '{}/y_gt.png'.format(self.model_dir))

    with open('{}/gt.txt'.format(self.model_dir), 'w') as f:
        for sample in sample_list:
            f.write(sample + '\n')

    # call once
    summary_once = self.sess.run(self.summary_once)
    self.summary_writer.add_summary(summary_once, 0)
    self.summary_writer.flush()

    for step in trange(self.start_step, self.max_step):
        fetch_dict = {
            "optim": self.optim,
            "loss": self.loss,
        }
        if step % self.log_step == 0 or step == self.max_step - 1:
            fetch_dict.update({
                "summary": self.summary_op,
            })

        if step % self.test_step == self.test_step - 1 or step == self.max_step - 1:
            l1, l2, iou, nb = 0, 0, 0, 0
            for x, y in self.batch_manager.test_batch():
                if self.data_format == 'NCHW':
                    x = to_nchw_numpy(x)
                    y = to_nchw_numpy(y)
                tl1, tl2, y_ = self.sess.run(
                    [self.tl1, self.tl2, self.yt_], {self.xt: x, self.yt: y})
                l1 += tl1
                l2 += tl2
                nb += 1

                # iou
                y_I = np.logical_and(y > 0, y_ > 0)
                y_I_sum = np.sum(y_I, axis=(1, 2, 3))
                y_U = np.logical_or(y > 0, y_ > 0)
                y_U_sum = np.sum(y_U, axis=(1, 2, 3))
                # print(y_I_sum, y_U_sum)
                nonzero_id = np.where(y_U_sum != 0)[0]
                if nonzero_id.shape[0] == 0:
                    acc = 1.0
                else:
                    acc = np.average(y_I_sum[nonzero_id] / y_U_sum[nonzero_id])
                iou += acc

                if nb > 500:
                    break

            l1 /= float(nb)
            l2 /= float(nb)
            iou /= float(nb)
            summary_test = self.sess.run(
                self.summary_test, {
                    self.test_acc_l1: l1,
                    self.test_acc_l2: l2,
                    self.test_acc_iou: iou
                })
            self.summary_writer.add_summary(summary_test, step)
            self.summary_writer.flush()

        result = self.sess.run(fetch_dict)
        if step % self.log_step == 0 or step == self.max_step - 1:
            self.summary_writer.add_summary(result['summary'], step)
            self.summary_writer.flush()

            loss = result['loss']
            assert not np.isnan(loss), 'Model diverged with loss = NaN'
            print("\n[{}/{}] Loss: {:.6f}".format(step, self.max_step, loss))

        if step % (self.log_step * 10) == 0 or step == self.max_step - 1:
            self.generate(x_list, self.model_dir, idx=step)

        if step % self.lr_update_step == self.lr_update_step - 1:
            self.sess.run(self.lr_update)

    # save last checkpoint..
    save_path = os.path.join(self.model_dir, 'model.ckpt')
    self.saver.save(self.sess, save_path, global_step=self.step)
    self.batch_manager.stop_thread()
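The trainer above interleaves logging, periodic evaluation, generation, and learning-rate updates inside a single trange step loop. The scheduling logic in isolation looks like the sketch below; the interval values are placeholders and the session calls are omitted:

# Sketch of the step-based scheduling only; the actual sess.run calls are left out.
from tqdm import trange

start_step, max_step = 0, 1000
log_step, test_step, lr_update_step = 100, 250, 400

for step in trange(start_step, max_step):
    if step % log_step == 0 or step == max_step - 1:
        pass        # write training summaries here
    if step % test_step == test_step - 1 or step == max_step - 1:
        pass        # run the held-out evaluation here
    if step % lr_update_step == lr_update_step - 1:
        pass        # decay the learning rate here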
def main(): parser = ArgumentParser() parser.add_argument('--train_corpus', type=str, default='./datasets/unlabel/device-service-train-merge.txt', required=False, help="sentence in each line.") parser.add_argument("--output_dir", type=str, default='./datasets/unlabel/device-service-rel', required=False) parser.add_argument("--bert_model", type=str, default='bert-base-uncased', required=False, choices=["bert-base-uncased", "bert-large-uncased", "bert-base-cased", "bert-base-multilingual-uncased", "bert-base-chinese", "bert-base-multilingual-cased"]) parser.add_argument("--do_lower_case", default=True) parser.add_argument("--do_whole_word_mask", default=True, help="Whether to use whole word masking rather than per-WordPiece masking.") parser.add_argument("--num_workers", type=int, default=1, help="The number of workers to use to write the files") parser.add_argument("--epochs_to_generate", type=int, default=1, help="Number of epochs of data to pregenerate") parser.add_argument("--max_seq_len", type=int, default=100) parser.add_argument("--short_seq_prob", type=float, default=0.1, help="Probability of making a short sentence as a training example") parser.add_argument("--masked_lm_prob", type=float, default=0.20, help="Probability of masking each token for the LM task") parser.add_argument("--max_predictions_per_seq", type=int, default=25, help="Maximum number of tokens to mask in each sequence") args = parser.parse_args() with open('./datasets/tag_vocab.txt', 'r', encoding='utf-8') as fp: tag_vocab = fp.read().splitlines() tag_vocab = [l.strip() for l in tag_vocab] args.tag_vocab = tag_vocab tokenizer = SubTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) vocab_list = list(tokenizer.vocab.keys()) nlp = spacy.load("en_core_web_sm") all_tags = set() all_rel = set() with DocumentDatabase() as docs: for file in glob.glob(args.train_corpus): with open(file, 'r', encoding='utf-8') as f: doc = [] # token tag = [] # pos tags head = [] # head index arc_label = [] # dependency relation domain_label = [] # domain label for line in tqdm(f, desc="Loading Dataset", unit=" lines"): line = line.strip().lower() if args.do_lower_case else line.strip() if len(line) == 1: if len(doc): docs.add_document(doc, tag, head, arc_label, domain_label) doc = [] tag = [] head = [] arc_label = [] domain_label = [] else: domain, line = line.split('***')[:2] nlp_doc = nlp(line) tokens = [t.text for t in nlp_doc] token_tags = [t.tag_ for t in nlp_doc] token_head = parse_tree(nlp_doc) token_dep_rel = [t.dep_ for t in nlp_doc] all_rel.update(token_dep_rel) tokens, token_tags, token_head_index, token_dep_rel, _ = tokenizer.subword_tokenize(tokens, token_tags, token_head, token_dep_rel) all_tags.update(token_tags) assert len(tokens) == len(token_tags) == len(token_head_index) == len(token_dep_rel) doc.append(tokens) tag.append(token_tags) head.append(token_head_index) arc_label.append(token_dep_rel) domain_label.append(domain) if doc: docs.add_document(doc, tag, head, arc_label, domain_label) # If the last doc didn't end on a newline, make sure it still gets added if len(docs) < 1: exit("ERROR: No document breaks were found in the input file!") if not os.path.exists(args.output_dir): os.mkdir(args.output_dir) if args.num_workers > 1: writer_workers = Pool(min(args.num_workers, args.epochs_to_generate)) arguments = [(docs, vocab_list, args, idx) for idx in range(args.epochs_to_generate)] writer_workers.starmap(create_training_file, arguments) else: for epoch in trange(args.epochs_to_generate, 
desc="Epoch"): create_training_file(docs, vocab_list, args, epoch)