Example #1
def download(left, right, top, bottom, zoom, filename, maptype="default"):

    for x in trange(left, right + 1):
        for y in trange(top, bottom + 1):
            path = './tiles/%s/%i/%i/%i.png' % (filename, zoom, x, y)
            if not os.path.exists(path):
                _download(x, y, zoom, filename, maptype)
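Note: `_download` is defined elsewhere in that project. A minimal stand-in, assuming a hypothetical slippy-map tile endpoint, might look like this:

import os
import requests

def _download(x, y, zoom, filename, maptype="default"):
    # Hypothetical tile URL template; the real project supplies its own endpoint.
    url = "https://tiles.example.com/%s/%i/%i/%i.png" % (maptype, zoom, x, y)
    path = './tiles/%s/%i/%i/%i.png' % (filename, zoom, x, y)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        f.write(requests.get(url, timeout=30).content)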
Example #2
def main():
    """Main program."""
    answer = 0

    start = time.time()
    max_period = 0
    for index in tqdm.trange(1, 1000):
        period = calculate_period_length(index)
        if period > max_period:
            max_period = period
            answer = index
    end = time.time()
    print("The answer is %d" % answer)
    print("%f seconds elapsed" % (end - start))

    start = time.time()
    max_period = 0
    for index in tqdm.trange(1, 1000):
        period = lambda_decimal_period(index)
        if period > max_period:
            max_period = period
            answer = index
    end = time.time()
    print("The answer is %d" % answer)
    print("%f seconds elapsed" % (end - start))

    import pyperclip
    pyperclip.copy(str(answer))
    print("The answer has been placed in the clipboard.")
Example #3
def trees_are_random(filename):
    res_file = filename.replace('_0', '_processed')
    with np.load(filename) as f:
        res, gold = f['res'], f['gold']
    num_trees, num_order, _ = res.shape
    all_trees = list(range(num_trees))
    all_order = list(range(num_order))
    all_pred = np.arange(len(gold))
    nrep = 100
    # rows: odd ensemble sizes; cols: multiples of 13 below num_order; last dim: (mean, std)
    rate = np.zeros((num_trees//2 + 1, (num_order - 1)//13, 2))
    frac = 1.3
    dropped = []
    for i in trange(1, num_trees+1, 2):
        for j in trange(13, num_order, 13):
            tmp_res = []
            for k in range(nrep):
                trees = random.sample(all_trees, i)
                orders = random.sample(all_order, j-1 if j % 2 == 0 else j)
                if i == 1:
                    vals = res[trees, orders, :]
                else:
                    vals = res[np.ix_(trees, orders, all_pred)].sum(0)
                tmp_res.append(mistakes(vals))
            thre = frac*np.median(tmp_res)
            good = np.array(tmp_res) < thre  # elementwise comparison needs an array, not a list
            bad = np.logical_not(good)
            dropped.append((i, j, bad.sum()/nrep))
            rate[(i-1)//2, j//13-1, :] = np.mean(tmp_res), np.std(tmp_res)
            np.savez_compressed(res_file, rate=rate)
Example #4
def main():
    bar = trange(60*25)
    bar.write("Working time...")
    for t in bar:
        time.sleep(1)
    bar = trange(60*5)
    bar.write("Break time...")
    for t in bar:
        time.sleep(1)
Example #5
def process_tilenum(left, right, top, bottom, zoom, output='output/mosaic.png'):
    """
    download and mosaic by tile number 
    """
    for x in trange(left, right + 1):
        for y in trange(top, bottom + 1):
            path = './tiles/%i/%i/%i.png' % (zoom, x, y)
            if not os.path.exists(path):
                _download(x, y, zoom)
    _mosaic(left, right, top, bottom, zoom, output)
Example #6
def test_trange():
    """ Test trange """
    with closing(StringIO()) as our_file:
        for _ in trange(3, file=our_file, leave=True):
            pass
        our_file.seek(0)
        assert '| 3/3 ' in our_file.read()

    with closing(StringIO()) as our_file2:
        for _ in trange(3, file=our_file2, leave=False):
            pass
        our_file2.seek(0)
        assert '| 3/3 ' not in our_file2.read()
Example #7
def ensemble(validation_base_path, validation_folder, validation_predicted_folder):
    pkl_files = []
    weights = []
    #weight_file = './file_weight_128.csv'
    #weight_file = './file_weight_128_144models.csv'
    #weight_file = './file_weight_128_144_updated models.csv'
    #weight_file = './file_weight_128_144_3rd version models.csv'
    weight_file = './10 best models weights for task 1.csv'
    #weight_file = './5 best models weights for task 1.csv'
    #weight_file = './20 best models weights for task 1.csv'

    with open(weight_file, 'r') as f:  # text mode so csv.reader gets str rows (Python 3)
        rows = csv.reader(f, delimiter=',')
        #next(rows, None)
        for row in rows:
            if '.pkl' in row[0]:
                pkl_files.append(validation_base_path + row[0])
            else:
                pkl_files.append(validation_base_path + row[0] + '.pkl')
            weights.append(float(row[1]))

    print(len(pkl_files))
    print(weights)
    print(np.sum(weights))

    mask_pred_challenge_list = []
    for i in trange(len(pkl_files)):
        mask_pred_challenge = pkl.load(open(pkl_files[i], 'rb'))
        mask_pred_challenge_list.append(mask_pred_challenge)
    mask_pred_challenge_list = np.array(mask_pred_challenge_list)
    print(mask_pred_challenge_list.shape)
    weights = np.array(weights)

    mask_pred_challenge = np.dot(mask_pred_challenge_list.transpose(1,2,3,0), weights)
    print(mask_pred_challenge.shape)

    if not os.path.exists(validation_predicted_folder):
        os.makedirs(validation_predicted_folder)

    cutoff = 0.5
    mask_pred_challenge_b = (np.where(mask_pred_challenge>=cutoff, 1, 0) * 255).astype(np.uint8)
    challenge_list = ISIC.list_from_folder(validation_folder)

    for i in trange(len(challenge_list)):
        _, _ = ISIC.show_images_full_sized(challenge_list,
                                           img_mask_pred_array=mask_pred_challenge_b,
                                           image_folder=validation_folder,
                                           mask_folder=None,
                                           index=i,
                                           output_folder=validation_predicted_folder,
                                           plot=False)
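The core ensembling step above is a tensor contraction: stack the per-model masks and dot with the weight vector. A minimal sketch with dummy shapes (3 models and 4x5 masks assumed):

import numpy as np

preds = np.random.rand(3, 4, 5)              # (n_models, H, W) stack of predictions
weights = np.array([0.5, 0.3, 0.2])          # per-model weights, summing to 1
blended = np.dot(preds.transpose(1, 2, 0), weights)  # weighted average, shape (H, W)
assert blended.shape == (4, 5)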
Example #8
def test_trange():
    our_file = StringIO()
    for i in trange(3, file=our_file, leave=True):
        pass
    our_file.seek(0)
    assert '| 3/3 ' in our_file.read()
    our_file.close()

    our_file2 = StringIO()
    for i in trange(3, file=our_file2, leave=False):
        pass
    our_file2.seek(0)
    assert '| 3/3 ' not in our_file2.read()
    our_file2.close()
Example #9
def _mosaic(left, right, top, bottom, zoom, output='output/mosaic.png'):
    size_x = (right - left + 1) * 256
    size_y = (bottom - top + 1) * 256
    output_im = Image.new("RGBA", (size_x, size_y))

    for x in trange(left, right + 1):
        for y in trange(top, bottom + 1):
            path = './tiles/%i/%i/%i.png' % (zoom, x, y)
            target_im = Image.open(path)
            output_im.paste(target_im, (256 * (x - left), 256 * (y - top)))
    output_dir = os.path.dirname(output)
    if output_dir and not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    output_im.save(output)
Example #10
def move_and_snap(m, s, fname, zenith = 0, destination = 0, acc_len = 1, n_accs = 10, dt = 0):
    '''
    Function that gets called over a range of airmasses in go(). Moves to the destination, 
    takes a snapshot, calculates true time based on offset, queries the motor position, then 
    calls io.write_to_hdf5 on the hdf5 filename.

    Inputs:
        Required - Motor, Spec, and  hdf5 filename.
        Optional - zenith angle wrt. 0 on the motor (degs), destination wrt. zenith (degs),
                   accumulation length in secs, number of accumulations,
                   computer's offset from true utc time in secs.
    Outputs:
        None, writes to disk. 
    '''
    #print('Moving to {} deg ZA'.format(destination))
    m.abst(destination + zenith)
    #print('Integrating')
    for i in tqdm.trange(n_accs, unit='accs'):
        spec = s.snap_spec()
        utc = ts.true_time(dt)
        pos = m.position()
        mjd = ts.iso_to_mjd(utc)
        io.write_to_hdf5(fname, spec, {
            'angle_degs': pos,
            'utc': utc,
            'mjd': mjd,
            'samp_rate_mhz': s.samp_rate,
            'acc_len_secs': s.acc_len,
            'zenith_degs': zenith
        })
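A sketch of the go()-style sweep the docstring mentions; the motor/spectrometer objects m and s and the angle schedule are assumed, not part of the original:

for za in [0.0, 10.0, 20.0, 30.0, 40.0]:  # hypothetical zenith-angle schedule
    move_and_snap(m, s, 'scan.h5', zenith=0, destination=za,
                  acc_len=1, n_accs=10, dt=0)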
Example #11
def main():
    """Main program."""
    answer = 0

    start_time = time.time()

    # Find next sequence after 1487, 4817, 8147
    for number_1 in tqdm.trange(1488, 9998):
        for index in range(1, (9999 - number_1) // 2):  # integer division for range()
            number_2 = number_1 + index
            number_3 = number_1 + (2 * index)
            if all([sorted(str(n)) == sorted(str(number_1)) \
                    for n in [number_2, number_3]]) \
            and all([sympy.ntheory.primetest.isprime(n) \
                     for n in [number_1, number_2, number_3]]):
                answer = int(str(number_1) + str(number_2) + str(number_3))
                break

        if answer > 0:
            break

    end_time = time.time()
    print("The answer is %d" % answer)
    print("%f seconds elapsed." % (end_time - start_time))

    import pyperclip
    pyperclip.copy(str(answer))
    print("The answer has been placed in the clipboard.")
Example #12
def simulate_system(bundle, reps=10, check_laplacian=True):
    """ Generate data from system setup
    """
    # lonely investigation :'(
    if check_laplacian:
        investigate_laplacian(bundle.graph)

    # solve system on network
    corr_mats = []
    var_sers = []
    all_sols = []
    for _ in trange(reps):
        sols, ts = solve_system(bundle.system_config)

        cmat = compute_correlation_matrix(sols)
        vser = compute_cluster_num(sols, len(bundle.graph.nodes()))

        corr_mats.append(cmat)
        var_sers.append(vser)
        all_sols.append(sols)

    bundle['all_sols'] = all_sols
    bundle['corr_mats'] = np.array(corr_mats)
    bundle['var_sers'] = np.array(var_sers)
    bundle['ts'] = ts

    return bundle
Example #13
def topography(df):
    xlim = int(np.floor(df.xcen.max()) + 1)
    ylim = int(np.floor(df.ycen.max()) + 1)

    # Take z in reverse order, so that when we iterate through them, the
    # lowest z value (the top of the topography) will come up first.
    print("Computing topography", end='\r')
    sys.stdout.flush()
    df_iter = df.sort_values(['ycen', 'xcen', 'zcen']).itertuples()  # DataFrame.sort was removed in favour of sort_values

    xcen = df.columns.get_loc("xcen") + 1
    ycen = df.columns.get_loc("ycen") + 1
    zcen = df.columns.get_loc("zcen") + 1

    row = None
    topo = np.zeros((ylim, xlim), dtype=np.int32)
    for y in trange(ylim, desc="Computing topography", leave=True):
        for x in range(xlim):
            def same_pixel(row):
                """Use tuple ordering to determine whether we're ahead
                or behind the dataframe."""
                image_index = (y,x)
                df_index = (row[ycen], row[xcen])
                return df_index < image_index

            # Drop until x,y coordinates match.  Since this is sorted by
            # x,y,z, the next pixel will be the lowest z.
            row = dropwhile(same_pixel, df_iter, row)
            if row[xcen] == x and row[ycen] == y:
                topo[y,x] = row[zcen]
            else:
                raise RuntimeError("No data at coordinate x={},y={}"
                        .format(x,y))

    return topo
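Note that the three-argument dropwhile used above is not itertools.dropwhile. A helper consistent with that call site might look like this (a sketch; the real project supplies its own):

def dropwhile(pred, iterator, current=None):
    """Return the first element (re-testing `current` first) for which pred is False."""
    if current is not None and not pred(current):
        return current
    for item in iterator:
        if not pred(item):
            return item
    raise RuntimeError("iterator exhausted")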
Example #14
    def get_training_bbox(bbox_dir, imglist):
        import xml.etree.ElementTree as ET
        ret = []

        def parse_bbox(fname):
            root = ET.parse(fname).getroot()
            size = list(root.find('size'))  # getchildren() was removed in Python 3.9
            size = [int(size[0].text), int(size[1].text)]

            box = root.find('object').find('bndbox')
            box = [float(x.text) for x in box]  # materialize: np.asarray on a map object fails
            return np.asarray(box, dtype='float32')

        with timed_operation('Loading Bounding Boxes ...'):
            cnt = 0
            for k in tqdm.trange(len(imglist)):
                fname = imglist[k][0]
                fname = fname[:-4] + 'xml'
                fname = os.path.join(bbox_dir, fname)
                try:
                    ret.append(parse_bbox(fname))
                    cnt += 1
                except Exception:
                    ret.append(None)
            logger.info("{}/{} images have bounding box.".format(cnt, len(imglist)))
        return ret
Example #15
def test_iter_overhead_hard():
    """Test overhead of iteration based tqdm (hard)"""

    total = int(1e5)

    with closing(MockIO()) as our_file:
        a = 0
        with trange(total, file=our_file, leave=True, miniters=1,
                    mininterval=0, maxinterval=0) as t:
            with relative_timer() as time_tqdm:
                for i in t:
                    a += i
        assert a == (total * total - total) / 2.0

        a = 0
        with relative_timer() as time_bench:
            for i in _range(total):
                a += i
                our_file.write(("%i" % a) * 40)

    # Compute relative overhead of tqdm against native range()
    try:
        assert time_tqdm() < 60 * time_bench()
    except AssertionError:
        raise AssertionError('trange(%g): %f, range(%g): %f' %
                             (total, time_tqdm(), total, time_bench()))
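These performance tests depend on two helpers not shown here; plausible sketches that match how they are used above (assumptions, not necessarily the originals):

from contextlib import contextmanager
from io import StringIO
from time import process_time

class MockIO(StringIO):
    """A StringIO whose writes are discarded, so I/O cost is excluded."""
    def write(self, data):
        pass

@contextmanager
def relative_timer():
    """Yield a zero-argument callable that reports elapsed CPU seconds."""
    start = process_time()
    elapser = lambda: process_time() - start
    yield lambda: elapser()
    spent = process_time() - start
    elapser = lambda: spent  # freeze the reading once the block exits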
Example #16
def main():
    """Main program."""
    answer = 0
    start_time = time.time()
    denominator = Fraction(1, 1)
    for index in tqdm.trange(1000):
        if index == 0:
            denominator = Fraction(2, 1)
        elif index == 1:
            denominator = 2 + Fraction(1, 2)
        else:
            denominator = 2 + Fraction(1, denominator)

        continual_fraction = 1 + Fraction(1, denominator)
        numerator_digits = len(str(continual_fraction.numerator))
        denominator_digits = len(str(continual_fraction.denominator))
        if numerator_digits > denominator_digits:
            answer += 1
    end_time = time.time()
    print("The answer is %d" % answer)
    print("%f seconds elapsed." % (end_time - start_time))

    import pyperclip
    pyperclip.copy(str(answer))
    print("The answer has been placed in the clipboard.")
Example #17
def test_iter_overhead_simplebar_hard():
    """Test overhead of iteration based tqdm vs simple progress bar (hard)"""

    total = int(1e4)

    with closing(MockIO()) as our_file:
        a = 0
        with trange(total, file=our_file, leave=True, miniters=1,
                    mininterval=0, maxinterval=0) as t:
            with relative_timer() as time_tqdm:
                for i in t:
                    a += i
        assert a == (total * total - total) / 2.0

        a = 0
        s = simple_progress(_range(total), file=our_file, leave=True,
                            miniters=1, mininterval=0)
        with relative_timer() as time_bench:
            for i in s:
                a += i

    # Compute relative overhead of tqdm against native range()
    try:
        assert time_tqdm() < 2.5 * time_bench()
    except AssertionError:
        raise AssertionError('trange(%g): %f, simple_progress(%g): %f' %
                             (total, time_tqdm(), total, time_bench()))
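`simple_progress` is the bare-bones reference bar used as the speed baseline; a sketch consistent with the call above (signature assumed):

import sys
import time

def simple_progress(iterable, file=sys.stdout, leave=True,
                    miniters=1, mininterval=0.1):
    """Minimal progress reporter: prints a plain counter, no formatting."""
    n = 0
    last_print_n = 0
    last_print_t = time.time()
    for obj in iterable:
        yield obj
        n += 1
        if n - last_print_n >= miniters:
            cur_t = time.time()
            if cur_t - last_print_t >= mininterval:
                file.write("%i\r" % n)
                last_print_n, last_print_t = n, cur_t
    if leave:
        file.write("%i\n" % n)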
Example #18
def featurize_time_series_submission(submission_df, features, structure):
    print("Featurizing time series")
    print(features.shape[1])
    assignments = structure['ASS_ASSIGNMENT']
    days = ['Lundi', 'Mardi', 'Mercredi', 'Jeudi', 'Vendredi', 'Samedi', 'Dimanche']

    ass_dfs = {}
    for day in days:
        for ass in assignments:
            ass_dfs[day + '_' + ass] = pd.read_pickle('files/split/' + day + '_' + ass + '.pkl')

    new_features = np.full((len(submission_df), _n_features), np.nan)

    submission_df = submission_df.set_index(['DAY_WE_DS', 'DATE', 'ASS_ASSIGNMENT'], drop=False)
    for i in trange(0, submission_df.shape[0]):
        (day, datetime, ass) = submission_df.index[i]
        df = ass_dfs[day + '_' + ass][(ass_dfs[day + '_' + ass].DATE.dt.hour == datetime.hour) &
                                      (ass_dfs[day + '_' + ass].DATE.dt.minute == datetime.minute)]
        df = df[df.DATE < datetime - DateOffset(days=3)]
        old_values = df.tail(_n_features)['CSPL_RECEIVED_CALLS'].to_numpy()  # .as_matrix() was removed from pandas

        # print(old_values)
        for j in range(len(old_values)):
            new_features[i, j] = old_values[j]

    for j in range(_n_features):
        features['prev_value_' + str(j)] = new_features[:, j]

    return features
Example #19
  def _maybe_generate_and_save(self, except_list=[]):
    self.data = {}

    for name, num in self.data_num.items():
      if name in except_list:
        tf.logging.info("Skip creating {} because of given except_list {}".format(name, except_list))
        continue
      path = self.get_path(name)

      if not os.path.exists(path):
        tf.logging.info("Creating {} for [{}]".format(path, self.task))

        x = np.zeros([num, self.max_length, 2], dtype=np.float32)
        y = np.zeros([num, self.max_length], dtype=np.int32)

        for idx in trange(num, desc="Create {} data".format(name)):
          n_nodes = self.rng.randint(self.min_length, self.max_length+ 1)
          nodes, res = generate_one_example(n_nodes, self.rng)
          x[idx,:len(nodes)] = nodes
          y[idx,:len(res)] = res

        np.savez(path, x=x, y=y)
        self.data[name] = TSP(x=x, y=y, name=name)
      else:
        tf.logging.info("Skip creating {} for [{}]".format(path, self.task))
        tmp = np.load(path)
        self.data[name] = TSP(x=tmp['x'], y=tmp['y'], name=name)
Example #20
    def spaceConvNumba2(self):
        """ Exactly the same as the former method, just contains a 
        nested function so the dot product appears more obvious """ 

        @checkarrays
        @jit
        def dotJit(subarray, kernel):
            """ perform a simple 'dot product' between the 2 dimensional 
            image subsets. 
            """
            total = 0.0
            # This is the O(n^2) part of the algorithm
            for i in range(subarray.shape[0]):
                for j in range(subarray.shape[1]):
                    total += subarray[i][j] * kernel[i][j]
            return total
     
        # this is the O(N^2) part of the algorithm
        for i in trange(self.__rangeX_):
            for j in range(self.__rangeY_):
                # dotJit is defined just above
                self.__arr_[i, j] = dotJit(
                    self.array[i:i+self.__rangeKX_,
                               j:j+self.__rangeKY_],
                    self.kernel
                )
     
        return self.__arr_
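For reference, the nested loop in dotJit computes the same value as a single vectorized NumPy expression; a sketch:

import numpy as np

def dot_numpy(subarray, kernel):
    # Elementwise multiply then sum == the O(n^2) loop above.
    return float(np.sum(subarray * kernel))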
Example #21
def test_iter_overhead():
    """ Test overhead of iteration based tqdm """
    try:
        assert checkCpuTime()
    except AssertionError:
        raise SkipTest

    total = int(1e6)

    with closing(MockIO()) as our_file:
        a = 0
        with relative_timer() as time_tqdm:
            for i in trange(total, file=our_file):
                a += i
        assert(a == (total * total - total) / 2.0)

        a = 0
        with relative_timer() as time_bench:
            for i in _range(total):
                a += i
                our_file.write(a)

    # Compute relative overhead of tqdm against native range()
    if time_tqdm() > 9 * time_bench():
        raise AssertionError('trange(%g): %f, range(%g): %f' %
                             (total, time_tqdm(), total, time_bench()))
Example #22
def send_dataflow_zmq(df, addr, hwm=50, format=None, bind=False):
    """
    Run DataFlow and send data to a ZMQ socket addr.
    It will serialize and send each datapoint to this address with a PUSH socket.
    This function never returns.

    Args:
        df (DataFlow): Will infinitely loop over the DataFlow.
        addr: a ZMQ socket endpoint.
        hwm (int): ZMQ high-water mark (buffer size)
        format (str): The serialization format.
             Default format uses :mod:`tensorpack.utils.serialize`.
             This format works with :class:`dataflow.RemoteDataZMQ`.
             An alternate format is 'zmq_ops', used by https://github.com/tensorpack/zmq_ops
             and :class:`input_source.ZMQInput`.
        bind (bool): whether to bind or connect to the endpoint address.
    """
    assert format in [None, 'zmq_op', 'zmq_ops']
    if format is None:
        dump_fn = dumps
    else:
        from zmq_ops import dump_arrays
        dump_fn = dump_arrays

    ctx = zmq.Context()
    socket = ctx.socket(zmq.PUSH)
    socket.set_hwm(hwm)
    if bind:
        socket.bind(addr)
    else:
        socket.connect(addr)
    try:
        df.reset_state()
        logger.info("Serving data to {} with {} format ...".format(
            addr, 'default' if format is None else 'zmq_ops'))
        INTERVAL = 200
        q = deque(maxlen=INTERVAL)

        try:
            total = df.size()
        except NotImplementedError:
            total = 0
        tqdm_args = get_tqdm_kwargs(leave=True, smoothing=0.8)
        tqdm_args['bar_format'] = tqdm_args['bar_format'] + "{postfix}"
        while True:
            with tqdm.trange(total, **tqdm_args) as pbar:
                for dp in df.get_data():
                    start = time.time()
                    socket.send(dump_fn(dp), copy=False)
                    q.append(time.time() - start)
                    pbar.update(1)
                    if pbar.n % INTERVAL == 0:
                        avg = "{:.3f}".format(sum(q) / len(q))
                        pbar.set_postfix({'AvgSendLat': avg})
    finally:
        logger.info("Exiting send_dataflow_zmq ...")
        socket.setsockopt(zmq.LINGER, 0)
        socket.close()
        if not ctx.closed:
            ctx.destroy(0)
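On the receiving end, a PULL socket consumes what this function sends; a minimal sketch assuming the default serializer (tensorpack.utils.serialize.loads) and a hypothetical endpoint:

import zmq
from tensorpack.utils.serialize import loads

ctx = zmq.Context()
socket = ctx.socket(zmq.PULL)
socket.set_hwm(50)
socket.bind("tcp://0.0.0.0:8877")  # hypothetical address; the sender would connect() to it
while True:
    dp = loads(socket.recv(copy=False).bytes)
    # ... hand the datapoint to the consumer ...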
Example #23
 def train(self):
     max_epoch = int(math.ceil(1. * self.max_iter / len(self.train_loader))) # 117
     for epoch in tqdm.trange(self.epoch, max_epoch, desc='Train', ncols=80):
         self.epoch = epoch
         self.train_epoch()
         if self.iteration >= self.max_iter:
             break
Example #24
def moran_process(N=1000,turns=10000,mean_site_muts=1,mean_rec_muts=1,init=sample_species,mutate=mutate,
                  fitness=fitness,pop=None,print_modulus=100,hist_modulus=10):
    #ringer = (np.array([1]+[0]*(K-1)),sample_eps())
    if pop is None:
        pop = [(lambda spec:(spec,fitness(spec)))(init())
               for _ in trange(N)]
    # ringer = make_ringer()
    # pop[0] = (ringer,fitness(ringer))
    #pop = [(ringer,fitness(ringer)) for _ in xrange(N)]
    site_mu = min(1/float(n*L) * mean_site_muts,1)
    rec_mu = min(1/float(K) * mean_rec_muts,1)
    hist = []
    for turn in range(turns):
        fits = [f for (s,f) in pop]
        #print fits
        birth_idx = inverse_cdf_sample(range(N),fits,normalized=False)
        if birth_idx is None:
            return pop
        death_idx = random.randrange(N)
        #print birth_idx,death_idx
        mother,f = pop[birth_idx]
        daughter = mutate(mother,site_mu,rec_mu)
        #print "mutated"
        pop[death_idx] = (daughter,fitness(daughter))
        mean_fits = mean(fits)
        #hist.append((f,mean_fits))
        if turn % hist_modulus == 0:
            mean_dna_ic = mean([motif_ic(sites,correct=False) for ((sites,eps),_) in pop])
            mean_rec = mean([recognizer_promiscuity(x) for (x,f) in pop])
            mean_recced = mean([sites_recognized((dna,rec)) for ((dna,rec),_) in pop])
            hist.append((turn,f,mean_fits,mean_dna_ic,mean_rec,mean_recced))
            if turn % print_modulus == 0:
                print(turn, "sel_fit:", f, "mean_fit:", mean_fits, "mean_dna_ic:", mean_dna_ic, "mean_rec_prom:", mean_rec)
    return pop,hist
Example #25
    def do(syst, ax):
        # data
        single_run_matrices = []
        for _ in trange(reps):
            sol = solve_system(syst)

            sol_extract = sol.T[int(len(sol.T)*3/4):]
            single_run_mat = compute_correlation_matrix(np.array([sol_extract]))

            if single_run_mat.shape == (4, 4):
                single_run_mat = single_run_mat[:-1,:-1]
            assert single_run_mat.shape == (3, 3)

            single_run_matrices.append(single_run_mat)
        single_run_matrices = np.asarray(single_run_matrices)

        # plotting
        cols = cycle(['b', 'r', 'g', 'c', 'm', 'y', 'k'])
        for i, row in enumerate(single_run_matrices.T):
            for j, series in enumerate(row):
                if i == j: break
                plot_histogram(
                    series[series!=1], ax,
                    label=r'$c_{{{},{}}}$'.format(i,j),
                    facecolor=next(cols), alpha=0.5,
                    bins=100)
Example #26
    def __init__(self, cache=None, **kwargs):
        super(GTZAN, self).__init__(**kwargs)
        if kwargs.get('conf') is not None:
            conf = kwargs['conf']
            cache = conf.get('cache', None)
        data_set_path = osp.join(DEFAULT_IMAGEST_BASE, self.data_set)
        self.data_set_path = data_set_path
        self.cache = cache
        X, y = parse_anno_file(data_set_path)
        if cache == 'raw':
            import librosa
            from tqdm import trange
            X_new = np.zeros((len(X), 1, 661500, 1))
            for i in trange(len(X)):
                x,_ = librosa.load(osp.join(DEFAULT_DATA_BASE, X[i]))
                x_len = min(661500, len(x))
                X_new[i,:,:x_len,0] = x[:x_len]
        if cache is not None and cache != 'raw':
            X = self.load_cache_X(X, cache)
            if cache == 'mfcc':
                X_new = np.zeros((len(X), X[0].shape[0], 1280, 1))
                for i, x in enumerate(X):
                    x_len = min(x.shape[1], 1280)
                    X_new[i,:,:x_len,0] = x[:,:x_len]
                X = X_new

        # layout_X
        if self.layout_x == 'rel_path':
            self.X = X
        else:
            self.X = self.init_layout_X(X)
        # layout_y
        self.y = self.init_layout_y(y)
Example #27
def stress(minutes):
    """Perform a CPU and memory stress test for the given `minutes`.

    The CPU stress test uses one thread per core, and the RAM stress test one
    thread per core, totalling all main memory available to user processes.

    Return a boolean indicating whether the stress test was successful.
    """
    with open('/proc/cpuinfo') as cpuinfo:
        ncores = len(re.findall(r'^processor\b', cpuinfo.read(), re.M))
    with open('/proc/meminfo') as meminfo:
        match = re.search(r'^MemAvailable:\s*([0-9]+) kB.*', meminfo.read(), re.M)
        mem_kib = int(match.group(1))
    # Exclude a percentage of available memory for the stress processes themselves.
    mem_worker_kib = (mem_kib / ncores) * 90 / 100
    proc = subprocess.Popen([
        "stress",
        "-c", str(ncores),
        "-m", str(ncores),
        "--vm-bytes", "%dK" % mem_worker_kib,
        "-t", "%dm" % minutes],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
    for _ in tqdm.trange(minutes * 60):  # update progress bar every second
        time.sleep(1)
    proc.communicate()  # wait for process, consume output
    return proc.returncode == 0
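A usage sketch (requires the `stress` binary on PATH; the argument is the duration in minutes):

if stress(5):
    print("5-minute stress test passed")
else:
    print("stress test failed")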
Example #28
def launch_experiments(ag_results, mode, nbNodes, nbRuns, proba_edge):
    folderName = str(nbNodes) + mode  # 'nb' was undefined; the parameter is nbNodes
    os.makedirs(folderName, exist_ok=True)
    os.chdir(folderName)
    print("Experiment in mode ", mode, " with ", nbNodes, " nodes ", end='')
    if nbRuns > 0:
        print("with ", nbRuns, " runs")
        for i in trange(nbRuns):
            subprocess.run(os.path.join(os.path.dirname(sys.path[0]), programPath) + " " + mode + " " + str(nbNodes) + " " + str(proba_edge) + " 2>> errors.txt",  stdout=subprocess.DEVNULL, shell=True)
            #subprocess.run(programPath + " " + mode + " " + str(nbNodes), check=True, stdout=subprocess.DEVNULL, shell=True)
    else:
        print("without runs: reusing results from previous invocation")

    # TODO: rather do that as a 3rd phase? So that it's possible to relaunch experiments and have them included
    results=[]
    files= glob.glob("complex_graph*.csv")
    print("Collating results")
    for f in tqdm(files):
        #data = np.genfromtxt(f, delimiter="\t", encoding=None, dtype=[('Quality', '<i8'), ('Budget', '<i8'), ('ExpectRemainingTime', '<i8'), ('Deadline', '<i8'), ('NbNodes', '<i8'), ('ExecutionTime', '<i8'), ('ChoosingDuration', '<i8'), ('CallbackFlags', 'S16')], names=True)
        data = np.genfromtxt(f, delimiter="\t", encoding=None, names=True, skip_header=1, dtype=[('Quality', '<f8'), ('Budget', '<f8'), ('ExpectRemainingTime', '<f8'), ('Deadline', '<f8'), ('NbDegradedNodes', '<f8'), ('NbResamplers', '<f8'),
            ('ExecutionTime', '<f8'), ('ChoosingDuration', '<f8'), ('CallbackFlags', '<U7')])
        #data = np.genfromtxt(f, delimiter="\t", encoding=None, dtype=None, names=True, skip_header=1)
        nbActualNodes=-1
        nbActualEdges=-1
        with open(f, "r") as datafile:
            line1 = datafile.readline().split(' ')
            nbActualNodes = int(line1[0])  # always equal to nbNodes by construction of the random graph
            nbActualEdges = int(line1[1])
        nbCycles = data.size  # data should be 1D (each element is a record)
        degraded = nbCycles - np.count_nonzero(data["Quality"])
        ag_results[(nbNodes, mode)].append((data, nbActualEdges, nbCycles, degraded))
    os.chdir("..")
Example #29
 def adev_at_tau_wrapper(idxs):
   if idxs[0] == 0:
     for i in trange(len(idxs)):
       adev_at_tau(idxs[i])
   else:
     for i in idxs:
       adev_at_tau(i)
Example #30
 def loop(model: Layer,
          images: List[Tensor],
          labels: List[Tensor],
          loss: Loss,
          optimizer: Optimizer = None) -> None:
     correct = 0         # Track number of correct predictions.
     total_loss = 0.0    # Track total loss.
 
     with tqdm.trange(len(images)) as t:
         for i in t:
             predicted = model.forward(images[i])             # Predict.
             if argmax(predicted) == argmax(labels[i]):       # Check for
                 correct += 1                                 # correctness.
             total_loss += loss.loss(predicted, labels[i])    # Compute loss.
 
             # If we're training, backpropagate gradient and update weights.
             if optimizer is not None:
                 gradient = loss.gradient(predicted, labels[i])
                 model.backward(gradient)
                 optimizer.step(model)
 
             # And update our metrics in the progress bar.
             avg_loss = total_loss / (i + 1)
             acc = correct / (i + 1)
             t.set_description(f"mnist loss: {avg_loss:.3f} acc: {acc:.3f}")
Example #31
def main():
    file_dir = "raw_data/by_class"

    train_data = {'users': [], 'user_data': {}, 'num_samples': []}
    test_data = {'users': [], 'user_data': {}, 'num_samples': []}

    train_path = "train/mytrain.json"
    test_path = "test/mytest.json"

    X = [[] for _ in range(NUM_USER)]
    y = [[] for _ in range(NUM_USER)]

    nist_data = {}

    for class_ in os.listdir(file_dir):

        real_class = relabel_class(class_)

        if real_class >= 36 and real_class <= 61:

            full_img_path = file_dir + "/" + class_ + "/train_" + class_
            all_files_this_class = os.listdir(full_img_path)
            random.shuffle(all_files_this_class)
            sampled_files_this_class = all_files_this_class[:7000]
            imgs = []
            for img in sampled_files_this_class:
                imgs.append(load_image(full_img_path + "/" + img))
            class_ = relabel_class(class_)
            print(class_)
            nist_data[class_ - 36] = imgs  # a list of list, key is (0, 25)
            print(len(imgs))

    # assign samples to users by power law
    num_samples = np.random.lognormal(4, 2, (NUM_USER)) + 5

    idx = np.zeros(26, dtype=np.int64)

    for user in range(NUM_USER):
        num_sample_per_class = int(num_samples[user] / CLASS_PER_USER)
        if num_sample_per_class < 2:
            num_sample_per_class = 2

        for j in range(CLASS_PER_USER):
            class_id = (user + j) % 26
            if idx[class_id] + num_sample_per_class < len(nist_data[class_id]):
                idx[class_id] = 0
            X[user] += nist_data[class_id][idx[class_id]:(
                idx[class_id] + num_sample_per_class)]
            y[user] += (class_id * np.ones(num_sample_per_class)).tolist()
            idx[class_id] += num_sample_per_class

    # Create data structure
    train_data = {'users': [], 'user_data': {}, 'num_samples': []}
    test_data = {'users': [], 'user_data': {}, 'num_samples': []}

    for i in trange(NUM_USER, ncols=120):
        uname = 'f_{0:05d}'.format(i)

        combined = list(zip(X[i], y[i]))
        random.shuffle(combined)
        X[i][:], y[i][:] = zip(*combined)
        num_samples = len(X[i])
        train_len = int(0.9 * num_samples)
        test_len = num_samples - train_len

        train_data['users'].append(uname)
        train_data['user_data'][uname] = {
            'x': X[i][:train_len],
            'y': y[i][:train_len]
        }
        train_data['num_samples'].append(train_len)
        test_data['users'].append(uname)
        test_data['user_data'][uname] = {
            'x': X[i][train_len:],
            'y': y[i][train_len:]
        }
        test_data['num_samples'].append(test_len)

    with open(train_path, 'w') as outfile:
        json.dump(train_data, outfile)
    with open(test_path, 'w') as outfile:
        json.dump(test_data, outfile)
Example #32
    datetime.timedelta(hours=4),
    datetime.timedelta(days=1)
]

alltime_dfs = {}
for ashi, tmd in zip(ashis, tmds):
    df = pd.read_csv(rh_root + "/alltime/market_" + ashi + ".csv",
                     parse_dates=True)
    df["openTime"] = pd.to_datetime(df.openTime)
    df["closeTime"] = df.openTime.shift(-1)
    df.loc[len(df) - 1, "closeTime"] = df.openTime[len(df) - 1] + tmd  # avoid chained assignment
    alltime_dfs[ashi] = df

df = alltime_dfs["h01"]

for weeki in trange(104, 180):
    kireme_d = datetime.datetime(year=2018, month=1,
                                 day=7) + datetime.timedelta(days=7) * weeki
    if kireme_d >= datetime.datetime(year=2020, month=12, day=29): break
    owbf_d = kireme_d - datetime.timedelta(days=7)

    dfs = {}
    for ashi in ashis:
        df = alltime_dfs[ashi]
        kireme_x = len(df[df.openTime <= kireme_d])
        owbf_x = len(df[df.openTime <= owbf_d])
        df = df[max(owbf_x - yoyuu, 0):kireme_x]
        df = df.reset_index(drop=True)

        df["openX"] = df.index
        df["closeX"] = df.openX + 1
Example #33
train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=4,
                                           shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=25)

optimizer = torch.optim.Adam(model.parameters(), lr=1.e-3)

loss_function = torch.nn.NLLLoss()
loss_function.to(device)

tb_logger = torch.utils.tensorboard.SummaryWriter('runs/log_mlp')
# %tensorboard --logdir runs

# train for a couple of epochs
n_epochs = 4
for epoch in trange(n_epochs):
    utils.train(model,
                train_loader,
                loss_function,
                optimizer,
                device,
                epoch,
                tb_logger=tb_logger)
    step = (epoch + 1) * len(train_loader)
    utils.validate(model,
                   val_loader,
                   loss_function,
                   device,
                   step,
                   tb_logger=tb_logger)
Example #34
def PMF(X: np.ndarray,
        d: int,
        l: float,
        s: float = 1,
        max_iter: int = 100,
        print_cost: int = 0,
        pretrained_u: np.ndarray = None,
        pretrained_v: np.ndarray = None) -> tuple:
    """
    Probabilistic Matrix Factorization
    algorithm implementation
    :param X: an observed matrix, dim(X) = n x m
    :param l: regularization parameter (lambda)
    :param s: regularization parameter (standard deviation)
    :param d: number of latent features
    :param max_iter: maximum iterations of the algorithm, default 100
    :param print_cost: if > 0 prints cost function every n-th iteration of the algorithm,
                       if = 0 does not print the cost at all
    :param pretrained_u: if you want to run more epochs
                       with pretrained matrices,
                       please, specify both; default None
    :param pretrained_v: see param pretrained_u
    :return: U, V - latent features matrices
             with dim(U) = d x n, dim(V) = d X m
    """

    n, m = X.shape
    ind = np.ones(X.shape)
    ind[X == 0] = 0  # indicator matrix

    # initialize latent features matrices
    if pretrained_u is None and pretrained_v is None:
        v = np.random.normal(0, 1 / l, (d, m))
        u = np.zeros((d, n))
    else:
        v = pretrained_v
        u = pretrained_u

    # suggestion from the lecturer:
    Omega_u = [list(np.where(X[i, :] > 0)[0]) for i in range(n)
               ]  # Omega_u[i] is the array of indices j for which M_ij is observed
    Omega_v = [list(np.where(X[:, j] > 0)[0]) for j in range(m)]

    # iterate through u,v
    for k in range(max_iter):
        # calculate U
        for i in trange(n):
            u[:, i] = ((1 / (l * s**2 + np.array([
                ind[i, j] * np.linalg.norm(v[:, j].reshape(d, 1), ord='fro')**2
                for j in Omega_u[i]
            ]).sum()) * (v @ (ind * X)[i, :].reshape((1, m)).T)).reshape(
                (d, )))

        # calculate V
        for j in trange(m):
            v[:, j] = ((1 / (l * s**2 + np.array([
                ind[i, j] * np.linalg.norm(u[:, i].reshape(d, 1), ord='fro')**2
                for i in Omega_v[j]
            ]).sum()) * (u @ (ind * X)[:, j].reshape((1, n)).T)).reshape(
                (d, )))

        # save this for later
        # np.save('/home/olga/Projects/HW_ML/lab4/experiment/u.npy', u)
        # np.save('/home/olga/Projects/HW_ML/lab4/experiment/v.npy', v)

        # compute cost
        cost = ((s**(-2)) * np.linalg.norm(ind * (X - u.T @ v), ord='fro')**2 +
                l * np.linalg.norm(u, ord='fro')**2 +
                l * np.linalg.norm(v, ord='fro')**2) / 2
        if print_cost and (k + 1) % print_cost == 0:
            print(f"{k+1} iteration cost: {np.log(cost):.5f}")

    return u, v
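A minimal usage sketch for the PMF routine above, on synthetic data (the hyper-parameters are illustrative, not tuned):

import numpy as np

X = np.random.randint(0, 6, size=(20, 15)).astype(float)  # zeros mark unobserved entries
U, V = PMF(X, d=5, l=0.1, s=1.0, max_iter=10, print_cost=5)
X_hat = U.T @ V  # reconstructed matrix, dim n x m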
Example #35
def main():
    parser = get_argument_parser()

    deepspeed.init_distributed(dist_backend='nccl')

    # Include DeepSpeed configuration arguments
    parser = deepspeed.add_config_arguments(parser)

    args = parser.parse_args()
    args.local_rank = int(os.environ['LOCAL_RANK'])  # must come after parse_args()

    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = read_squad_examples(input_file=args.train_file,
                                             is_training=True)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    # model = BertForQuestionAnswering.from_pretrained(args.bert_model,
    #            cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank))

    # Support for word embedding padding checkpoints
    # Prepare model

    bert_model_config = {
        "vocab_size_or_config_json_file": 119547,
        "hidden_size": 1024,
        "num_hidden_layers": 24,
        "num_attention_heads": 16,
        "intermediate_size": 4096,
        "hidden_act": "gelu",
        "hidden_dropout_prob": args.dropout,
        "attention_probs_dropout_prob": args.dropout,
        "hidden_dropout_prob": 0.1,
        "attention_probs_dropout_prob": 0.1,
        "max_position_embeddings": 512,
        "type_vocab_size": 2,
        "initializer_range": 0.02
    }

    if args.ckpt_type == "DS":
        if args.preln:
            bert_config = BertConfigPreLN(**bert_model_config)
        else:
            bert_config = BertConfig(**bert_model_config)
    else:
        # Models from Tensorflow and Huggingface are post-LN.
        if args.preln:
            raise ValueError("Should NOT use --preln if the loading checkpoint doesn't use pre-layer-norm.")

        # Use the original bert config if want to load from non-DeepSpeed checkpoint.
        if args.origin_bert_config_file is None:
            raise ValueError("--origin_bert_config_file is required for loading non-DeepSpeed checkpoint.")

        bert_config = BertConfig.from_json_file(args.origin_bert_config_file)

        if bert_config.vocab_size != len(tokenizer.vocab):
            raise ValueError("vocab size from original checkpoint mismatch.")

    bert_config.vocab_size = len(tokenizer.vocab)
    # Padding for divisibility by 8
    if bert_config.vocab_size % 8 != 0:
        vocab_diff = 8 - (bert_config.vocab_size % 8)
        bert_config.vocab_size += vocab_diff

    if args.preln:
        model = BertForQuestionAnsweringPreLN(bert_config, args)
    else:
        model = BertForQuestionAnswering(bert_config, args)

    print("VOCAB SIZE:", bert_config.vocab_size)
    if args.model_file != "0":  # 'is not' compared identity, not string equality
        logger.info(f"Loading Pretrained Bert Encoder from: {args.model_file}")

        if args.ckpt_type == "DS":
            checkpoint_state_dict = torch.load(args.model_file,
                                               map_location=torch.device("cpu"))
            if 'module' in checkpoint_state_dict:
                logger.info('Loading DeepSpeed v2.0 style checkpoint')
                model.load_state_dict(checkpoint_state_dict['module'],
                                      strict=False)
            elif 'model_state_dict' in checkpoint_state_dict:
                model.load_state_dict(checkpoint_state_dict['model_state_dict'],
                                      strict=False)
            else:
                raise ValueError("Unable to find model state in checkpoint")
        else:
            from convert_bert_ckpt_to_deepspeed import convert_ckpt_to_deepspeed
            convert_ckpt_to_deepspeed(model, args.ckpt_type, args.model_file, vocab_diff, args.deepspeed_transformer_kernel)

        logger.info(f"Pretrained Bert Encoder Loaded from: {args.model_file}")

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    # hack to remove pooler, which is not used
    # and would otherwise produce None grads that break apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    },{
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    model, optimizer, _, _ = deepspeed.initialize(
        args=args,
        model=model,
        model_parameters=optimizer_grouped_parameters,
        dist_init_required=True)
    
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        #torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError(
            "At least one of `do_train` or `do_predict` must be True.")

    if args.do_train:
        if not args.train_file:
            raise ValueError(
                "If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict:
        if not args.predict_file:
            raise ValueError(
                "If `do_predict` is True, then `predict_file` must be specified."
            )

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        os.makedirs(args.output_dir, exist_ok=True)

    # Prepare Summary writer
    if torch.distributed.get_rank() == 0 and args.job_name is not None:
        args.summary_writer = get_summary_writer(name=args.job_name,
                                                 base=args.output_dir)
    else:
        args.summary_writer = None



    logger.info("propagate deepspeed-config settings to client settings")
    args.train_batch_size = model.train_micro_batch_size_per_gpu()
    args.gradient_accumulation_steps = model.gradient_accumulation_steps()
    args.fp16 = model.fp16_enabled()
    args.print_steps = model.steps_per_print()
    args.learning_rate = model.get_lr()[0]
    args.wall_clock_breakdown = model.wall_clock_breakdown()

    t_total = num_train_steps
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()

    global_step = 0
    if args.do_train:
        cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}'.format(
            list(filter(None, args.bert_model.split('/'))).pop(),
            str(args.max_seq_length), str(args.doc_stride),
            str(args.max_query_length))
        train_features = None
        try:
            with open(cached_train_features_file, "rb") as reader:
                train_features = pickle.load(reader)
        except Exception:  # fall back to recomputing features if the cache is missing/corrupt
            train_features = convert_examples_to_features(
                examples=train_examples,
                tokenizer=tokenizer,
                max_seq_length=args.max_seq_length,
                doc_stride=args.doc_stride,
                max_query_length=args.max_query_length,
                is_training=True)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info("  Saving train features into cached file %s",
                            cached_train_features_file)
                with open(cached_train_features_file, "wb") as writer:
                    pickle.dump(train_features, writer)
        logger.info("***** Running training *****")
        logger.info("  Num orig examples = %d", len(train_examples))
        logger.info("  Num split examples = %d", len(train_features))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_start_positions = torch.tensor(
            [f.start_position for f in train_features], dtype=torch.long)
        all_end_positions = torch.tensor(
            [f.end_position for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_start_positions,
                                   all_end_positions)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        ema_loss = 0.
        sample_count = 0
        num_epoch = 0
        all_step_time = 0.0
        ave_rounds = 20
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            num_epoch += 1
            epoch_step = 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration", smoothing=0)):
                start_time = time.time()
                bs_size = batch[0].size()[0]
                if n_gpu == 1:
                    batch = tuple(
                        t.to(device)
                        for t in batch)  # multi-gpu does scattering it-self
                input_ids, input_mask, segment_ids, start_positions, end_positions = batch

                loss = model(input_ids, segment_ids, input_mask,
                             start_positions, end_positions)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.

                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                ema_loss = args.loss_plot_alpha * ema_loss + (
                    1 - args.loss_plot_alpha) * loss.item()

                model.backward(loss)
                loss_item = loss.item() * args.gradient_accumulation_steps
                loss = None

                sample_count += (args.train_batch_size *
                                 torch.distributed.get_world_size())

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify learning rate with special warm up BERT uses
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / t_total, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step

                    model.step()
                    global_step += 1
                    epoch_step += 1

                    if torch.distributed.get_rank(
                    ) == 0 and args.summary_writer:
                        summary_events = [
                            (f'Train/Steps/lr', lr_this_step, global_step),
                            (f'Train/Samples/train_loss', loss_item,
                             sample_count),
                            (f'Train/Samples/lr', lr_this_step, sample_count),
                            (f'Train/Samples/train_ema_loss', ema_loss,
                             sample_count)
                        ]

                        if args.fp16 and hasattr(optimizer, 'cur_scale'):
                            summary_events.append(
                                (f'Train/Samples/scale', optimizer.cur_scale,
                                 sample_count))
                        write_summary_events(args.summary_writer,
                                             summary_events)
                        args.summary_writer.flush()

                    if torch.distributed.get_rank() == 0 and (
                            step + 1) % args.print_steps == 0:
                        logger.info(
                            f"bert_squad_progress: step={global_step} lr={lr_this_step} loss={ema_loss}"
                        )
                else:
                    model.step()

                if is_time_to_exit(args=args,
                                   epoch_steps=epoch_step,
                                   global_steps=global_step):
                    logger.info(
                        f'Warning: Early epoch termination due to max steps limit, epoch step ={epoch_step}, global step = {global_step}, epoch = {num_epoch}'
                    )
                    break
                one_step_time = time.time() - start_time
                all_step_time += one_step_time
                if (step + 1) % ave_rounds == 0 and torch.distributed.get_rank() == 0:
                    print('At Step {}, Averaged Throughput for {} rounds is: {} Samples/s'.format(
                        step, ave_rounds,
                        bs_size * ave_rounds * torch.distributed.get_world_size() / all_step_time),
                        flush=True)
                    all_step_time = 0.0
      
    # Save a trained model
    # model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
    #output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    # if args.do_train:
    #    torch.save(model_to_save.state_dict(), output_model_file)

    # Load a trained model that you have fine-tuned

    #model_state_dict = torch.load(output_model_file)
    #model = BertForQuestionAnswering.from_pretrained(args.bert_model, state_dict=model_state_dict)
    # model.to(device)

    if args.do_predict and (args.local_rank == -1
                            or torch.distributed.get_rank() == 0):
        eval_examples = read_squad_examples(input_file=args.predict_file,
                                            is_training=False)
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False)

        logger.info("***** Running predictions *****")
        logger.info("  Num orig examples = %d", len(eval_examples))
        logger.info("  Num split examples = %d", len(eval_features))
        logger.info("  Batch size = %d", args.predict_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_example_index = torch.arange(all_input_ids.size(0),
                                         dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_example_index)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.predict_batch_size)

        model.eval()
        all_results = []
        logger.info("Start evaluating")
        for input_ids, input_mask, segment_ids, example_indices in tqdm(
                eval_dataloader, desc="Evaluating"):
            if len(all_results) % 1000 == 0:
                logger.info("Processing example: %d" % (len(all_results)))
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            with torch.no_grad():
                batch_start_logits, batch_end_logits = model(
                    input_ids, segment_ids, input_mask)
            for i, example_index in enumerate(example_indices):
                start_logits = batch_start_logits[i].detach().cpu().tolist()
                end_logits = batch_end_logits[i].detach().cpu().tolist()
                eval_feature = eval_features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                all_results.append(
                    RawResult(unique_id=unique_id,
                              start_logits=start_logits,
                              end_logits=end_logits))
        output_prediction_file = os.path.join(args.output_dir,
                                              "predictions.json")
        output_nbest_file = os.path.join(args.output_dir,
                                         "nbest_predictions.json")
        write_predictions(eval_examples, eval_features, all_results,
                          args.n_best_size, args.max_answer_length,
                          args.do_lower_case, output_prediction_file,
                          output_nbest_file, args.verbose_logging)
Example #36
def main():
    assert pyro.__version__.startswith('1.6.0')
    # Enable smoke test to test functionality
    smoke_test = False

    logging.info(f"CUDA available: {torch.cuda.is_available()}")

    # Loading data
    logging.info("Loading data...")
    docs = prepro_file_load("doc_word_matrix").to_dense()
    doc_categories = prepro_file_load("doc_cat_one_hot_matrix")
    # doc_categories = torch.t(torch.reshape(torch.Tensor(list(prepro_file_load("doc2category").values())), (1, -1)))
    id2word = prepro_file_load("id2word")
    id2cat = prepro_file_load("id2category")

    # Put vocab into dataframe for exploration of data
    vocab = pd.DataFrame(columns=['index', 'word'])
    vocab['index'] = list(id2word.keys())
    vocab['word'] = list(id2word.values())

    logging.info(f"Vocab dictionary size: {len(vocab)}")
    logging.info(f"Corpus size: {docs.shape}")

    # Setting global variables
    seed = 0
    torch.manual_seed(seed)
    pyro.set_rng_seed(seed)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    docs = docs.float()
    doc_categories = doc_categories.float()
    num_categories = len(id2cat)
    num_topics = num_categories * 2 if not smoke_test else 3
    batch_size = 32
    learning_rate = 1e-3
    num_epochs = 50 if not smoke_test else 1

    # Training
    pyro.clear_param_store()

    prodLDA = CategoryProdLDA(vocab_size=docs.shape[1],
                              num_topics=num_topics,
                              num_categories=num_categories,
                              hidden=100 if not smoke_test else 10,
                              dropout=0.2)
    prodLDA.to(device)

    optimizer = pyro.optim.Adam({"lr": learning_rate})
    svi = SVI(prodLDA.model,
              prodLDA.guide,
              optimizer,
              loss=TraceMeanField_ELBO())
    num_batches = int(math.ceil(docs.shape[0] /
                                batch_size)) if not smoke_test else 1

    losses = []

    logging.info("Training...")
    bar = trange(num_epochs)
    for epoch in bar:
        running_loss = 0.0
        for i in range(num_batches):
            batch_docs = docs[i * batch_size:(i + 1) *
                              batch_size, :].to(device)
            batch_cats = doc_categories[i * batch_size:(i + 1) *
                                        batch_size, :].to(device)
            loss = svi.step(batch_docs, batch_cats)
            running_loss += loss / batch_docs.size(0)

        # Save and log losses
        losses.append(running_loss)
        bar.set_postfix(epoch_loss='{:.2e}'.format(running_loss))
        if epoch % 5 == 0:
            logging.info('{: >5d}\t{}'.format(epoch,
                                              '{:.2e}'.format(running_loss)))
    logging.info(f"Final loss: {'{:.2e}'.format(losses[-1])}/{losses[-1]}")

    if not smoke_test:
        # Plot loss over epochs
        plt.plot(losses)
        plt.title("ELBO")
        plt.xlabel("Epoch")
        plt.ylabel("Loss")
        plot_file_name = "../ProdCategoryLDA-loss-2017_categories-" + str(num_categories) + \
                         "_topics-" + str(num_topics) + \
                         "_batch-" + str(batch_size) + \
                         "_lr-" + str(learning_rate) + \
                         "_epochs-" + str(num_epochs) + \
                         ".png"
        plt.savefig(plot_file_name)
        plt.show()

        # Logging top 10 weighted words in topics
        beta = prodLDA.beta()
        for n in range(beta.shape[0]):
            sorted_, indices = torch.sort(beta[n], descending=True)
            df = pd.DataFrame(indices[:10].numpy(), columns=['index'])
            words = pd.merge(df,
                             vocab[['index', 'word']],
                             how='left',
                             on='index')['word'].values.tolist()
            logging.info(f"Topic {n}: {words}")
def train(train_dataset, model, tokenizer, hyperparams):

    verbose = hyperparams["verbose"]
    disable = not verbose

    local_rank = hyperparams["local_rank"]
    per_gpu_train_batch_size = hyperparams["per_gpu_train_batch_size"]
    n_gpu = hyperparams["n_gpu"]
    max_steps = hyperparams["max_steps"]
    num_train_epochs = hyperparams["num_train_epochs"]
    gradient_accumulation_steps = hyperparams["gradient_accumulation_steps"]
    weight_decay = hyperparams["weight_decay"]
    learning_rate = hyperparams["learning_rate"]
    adam_epsilon = hyperparams["adam_epsilon"]
    warmup_steps = hyperparams["warmup_steps"]
    seed = hyperparams["random_state"]
    device = hyperparams["device"]
    model_type = hyperparams["model_type"]
    max_grad_norm = hyperparams["max_grad_norm"]

    save_steps = hyperparams['save_steps']

    output_dir = hyperparams["output_dir"]
    log_path = os.path.join(output_dir, "log.csv")
    fp16_opt_level = hyperparams["fp16_opt_level"]
    fp16 = hyperparams["fp16"]

    model_name_or_path = hyperparams["model_name_or_path"]
    opt_path = os.path.join(model_name_or_path, "optimizer.pt")
    sche_path = os.path.join(model_name_or_path, "scheduler.pt")

    training_logs = {"loss": [], "learning_rate": []}
    train_batch_size = per_gpu_train_batch_size * max(1, n_gpu)

    if local_rank == -1:
        train_sampler = RandomSampler(train_dataset)
    else:
        train_sampler = DistributedSampler(train_dataset)

    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=train_batch_size)

    if max_steps > 0:
        t_total = max_steps
        num_train_epochs = max_steps // (len(train_dataloader) //
                                         gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader) // gradient_accumulation_steps * num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]

    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=learning_rate,
                      eps=adam_epsilon)

    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup_steps,
                                                num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(opt_path) and os.path.isfile(sche_path):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(opt_path))
        scheduler.load_state_dict(torch.load(sche_path))

    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            find_unused_parameters=True)

    # Train!
    logging.info("***** Running training *****")
    logging.info("  Num examples = %d", len(train_dataset))
    logging.info("  Num Epochs = %d", num_train_epochs)
    logging.info("  Instantaneous batch size per GPU = %d",
                 per_gpu_train_batch_size)
    logging.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        train_batch_size * gradient_accumulation_steps *
        (torch.distributed.get_world_size() if local_rank != -1 else 1))
    logging.info("  Gradient Accumulation steps = %d",
                 gradient_accumulation_steps)
    logging.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0

    # Check if continuing training from a checkpoint
    if os.path.exists(
            model_name_or_path) and model_name_or_path.find("checkpoints") > 0:
        # set global_step to the global_step of the last saved checkpoint
        # from the model path
        global_step = int(model_name_or_path.split("-")[-1].split("/")[0])
        epochs_trained = global_step // (len(train_dataloader) //
                                         gradient_accumulation_steps)
        steps_trained_in_current_epoch = global_step % (
            len(train_dataloader) // gradient_accumulation_steps)

        logging.info(
            "  Continuing training from checkpoint, will skip to saved global_step"
        )
        logging.info("  Continuing training from epoch %d", epochs_trained)
        logging.info("  Continuing training from global step %d", global_step)
        logging.info("  Will skip the first %d steps in the first epoch",
                     steps_trained_in_current_epoch)

    tr_loss = 0.0
    model.zero_grad()
    set_seed(seed, n_gpu=n_gpu)  # Added here for reproducibility

    train_iterator = trange(epochs_trained,
                            int(num_train_epochs),
                            desc="Epoch",
                            disable=disable)

    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=disable)

        for step, batch in enumerate(epoch_iterator):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue
            model.train()
            batch = tuple(t.to(device) for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": batch[3]
            }
            # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
            if model_type != "distilbert":
                inputs["token_type_ids"] = (batch[2] if model_type in [
                    "bert", "xlnet", "albert"
                ] else None)
            outputs = model(**inputs)
            loss = outputs[0]
            if n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps
            if fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            training_logs["loss"].append(loss.item())
            training_logs["learning_rate"].append(scheduler.get_last_lr()[0])
            if (step + 1) % gradient_accumulation_steps == 0:
                if fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if local_rank in [-1, 0] and save_steps > 0 and global_step % save_steps == 0:
                    # Save model checkpoint under a fresh path so output_dir
                    # is not nested deeper on every save
                    ckpt_dir = os.path.join(output_dir,
                                            "checkpoint-{}".format(global_step))
                    if not os.path.exists(ckpt_dir):
                        os.makedirs(ckpt_dir)
                    # Take care of distributed/parallel training
                    model_to_save = (model.module
                                     if hasattr(model, "module") else model)
                    model_to_save.save_pretrained(ckpt_dir)
                    tokenizer.save_pretrained(ckpt_dir)
                    torch.save(
                        hyperparams,
                        os.path.join(ckpt_dir, "training_hyperparams.bin"))
                    logging.info("Saving model checkpoint to %s", ckpt_dir)
                    torch.save(optimizer.state_dict(),
                               os.path.join(ckpt_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(ckpt_dir, "scheduler.pt"))
                    logging.info("Saving optimizer and scheduler states to %s",
                                 ckpt_dir)

            if max_steps > 0 and global_step > max_steps:
                epoch_iterator.close()
                break
        if max_steps > 0 and global_step > max_steps:
            train_iterator.close()
            break

    training_logs = pd.DataFrame(training_logs)
    training_logs.to_csv(log_path, index=False)
    return global_step, tr_loss / global_step
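
The training loop above divides the loss by gradient_accumulation_steps and steps the optimizer once every that many batches, so the summed gradient matches one large batch. Stripped of fp16, clipping, and distributed concerns, the pattern reduces to this sketch (assuming, as with the transformers models here, that the model takes its inputs as keyword arguments and returns the loss as its first output):

def train_with_accumulation(model, dataloader, optimizer, accum_steps):
    """Accumulate gradients over accum_steps batches before each optimizer step."""
    model.train()
    model.zero_grad()
    for step, batch in enumerate(dataloader):
        # scale so summed gradients match one large batch
        loss = model(**batch)[0] / accum_steps
        loss.backward()
        if (step + 1) % accum_steps == 0:
            optimizer.step()
            model.zero_grad()
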
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        # set the TensorBoard log dir for visualization here
        tb_writer = SummaryWriter(comment=args.log_comment)

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        
    # if freeze BERT parameters
    if args.freeze_bert:
        for params in model.bert.parameters():
            params.requires_grad = False

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)

    if args.scheduler == "linear":
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
    elif args.scheduler == "cosine":
        scheduler = WarmupCosineSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
    else:
        raise ValueError("Unknown scheduler: {}".format(args.scheduler))


    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    args.logging_steps = t_total // args.num_train_epochs
    logger.info("  Logging steps = %d", args.logging_steps)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad() 
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility (even between Python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {'input_ids':       batch[0], 
                      'attention_mask':  batch[1],
                      'start_positions': batch[3],
                      'end_positions':   batch[4],
                      'concept_ids': batch[7],
                      'concept_masks': batch[8]}
            if args.model_type != 'distilbert':
                inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]
            if args.model_type in ['xlnet', 'xlm']:
                inputs.update({'cls_index': batch[5],
                               'p_mask':       batch[6]})
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1


                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate on a single GPU, otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
                    logging_loss = tr_loss

                    # Save model checkpoint (this variant saves on the same cadence as logging)
                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
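
Both WarmupLinearSchedule and WarmupCosineSchedule used above share a warmup-then-decay shape; for the linear case the learning-rate multiplier is a simple piecewise function of the step. A hedged sketch of that shape (for intuition, not the library implementation):

def linear_warmup_decay(step, warmup_steps, t_total):
    """LR multiplier: ramps 0 -> 1 over warmup_steps, then decays linearly to 0 at t_total."""
    if step < warmup_steps:
        return step / max(1, warmup_steps)
    return max(0.0, (t_total - step) / max(1, t_total - warmup_steps))
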
Exemple #39
def train(args, train_dataset, model):
    # use tensorboard to keep track of training process
    tb_writer = SummaryWriter('loss')

    #train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=args.train_batch_size,
                                  shuffle=True)

    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=len(train_dataloader) * args.num_train_epochs)

    # Train!
    logging.info("***** Running training *****")
    #logging.info("  Num examples = %d", len(train_dataset))
    logging.info("  Num Epochs = %d", args.num_train_epochs)

    logging.info("  Let's start finetuning!")
    tr_loss, logging_loss = 0.0, 0.0
    global_step = 0
    for epoch in tqdm.trange(args.num_train_epochs, desc='Epoch'):
        #epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in tqdm.tqdm(enumerate(train_dataloader)):
            model.train()

            outputs = model(
                is_training=True,
                input_ids=batch["input_ids"].long().to(DEVICE),
                attention_mask=batch['input_mask'].long().to(DEVICE),
                token_type_ids=batch['segment_ids'].long().to(DEVICE),
                start_positions=batch["start_positions"].long().to(DEVICE),
                end_positions=batch["end_positions"].long().to(DEVICE),
                answer_types=batch["answer_types"].long().to(DEVICE))

            loss = outputs[-1]
            if args.grad_acc_steps > 1:
                loss = loss / args.grad_acc_steps

            loss.backward()
            tr_loss += loss.item()
            if (step + 1) % args.grad_acc_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1

                if global_step % args.logging_steps == 0:
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    print('loss',
                          (tr_loss - logging_loss) / args.logging_steps)
                    logging_loss = tr_loss
            # empty cache
            del batch
            torch.cuda.empty_cache()

        # save checkpoint
        #if global_step % args.save_steps == 0:
        output_dir = os.path.join(args.output_dir,
                                  "checkpoint-{}".format(global_step))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        # Take care of distributed/parallel training
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.save_pretrained(output_dir)

        torch.save(args, os.path.join(output_dir, "training_args.bin"))
        logging.info("Saving model checkpoint to %s", output_dir)

        #torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
        #logging.info("Saving optimizer and scheduler states to %s", output_dir)

    return global_step, tr_loss / global_step
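
The (tr_loss - logging_loss) / args.logging_steps idiom above reports the mean loss over the most recent logging window by differencing a running total. The same bookkeeping as a tiny helper, purely for illustration (names hypothetical):

class WindowedLoss:
    """Track a cumulative loss and report the mean over each logging window."""
    def __init__(self):
        self.total = 0.0
        self._last = 0.0

    def add(self, value):
        self.total += value

    def window_mean(self, window_size):
        mean = (self.total - self._last) / window_size
        self._last = self.total
        return mean
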
Exemple #40
    def run(self, thunk, num_cpu=1, data_dir=None, datestamp=False):
        """
        Run each variant in the grid with function 'thunk'.

        Note: 'thunk' must be either a callable function, or a string. If it is
        a string, it must be the name of a parameter whose values are all 
        callable functions.

        Uses ``call_experiment`` to actually launch each experiment, and gives
        each variant a name using ``self.variant_name()``. 

        Maintenance note: the args for ExperimentGrid.run should track closely
        to the args for call_experiment. However, ``seed`` is omitted because
        we presume the user may add it as a parameter in the grid.
        """

        # Print info about self.
        self.print()

        # Make the list of all variants.
        variants = self.variants()

        # Print variant names for the user.
        var_names = set([self.variant_name(var) for var in variants])
        var_names = sorted(list(var_names))
        line = '='*DIV_LINE_WIDTH
        preparing = colorize('Preparing to run the following experiments...', 
                             color='green', bold=True)
        joined_var_names = '\n'.join(var_names)
        announcement = f"\n{preparing}\n\n{joined_var_names}\n\n{line}"
        print(announcement)


        if WAIT_BEFORE_LAUNCH > 0:
            delay_msg = colorize(dedent("""
            Launch delayed to give you a few seconds to review your experiments.

            To customize or disable this behavior, change WAIT_BEFORE_LAUNCH in
            spinup/user_config.py.

            """), color='cyan', bold=True)+line
            print(delay_msg)
            wait, steps = WAIT_BEFORE_LAUNCH, 100
            prog_bar = trange(steps, desc='Launching in...', 
                              leave=False, ncols=DIV_LINE_WIDTH, 
                              mininterval=0.25,
                              bar_format='{desc}: {bar}| {remaining} {elapsed}')
            for _ in prog_bar:
                time.sleep(wait/steps)

        # Run the variants.
        for var in variants:
            exp_name = self.variant_name(var)

            # Figure out what the thunk is.
            if isinstance(thunk, str):
                # Assume one of the variant parameters has the same
                # name as the string you passed for thunk, and that 
                # variant[thunk] is a valid callable function.
                thunk_ = var[thunk]
                del var[thunk]
            else:
                # Assume thunk is given as a function.
                thunk_ = thunk

            call_experiment(exp_name, thunk_, num_cpu=num_cpu, 
                            data_dir=data_dir, datestamp=datestamp, **var)
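
The launch delay in ExperimentGrid.run uses trange purely as a countdown display, with a custom bar_format assembled from tqdm's standard format fields. The idiom works standalone; a minimal sketch:

import time
from tqdm import trange

def countdown(seconds, steps=100):
    """Show a tqdm countdown bar for roughly `seconds` before continuing."""
    for _ in trange(steps, desc='Launching in...', leave=False, mininterval=0.25,
                    bar_format='{desc}: {bar}| {remaining} {elapsed}'):
        time.sleep(seconds / steps)
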
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    warm_up_steps = int(args.warmup_steps * t_total)
    logging_steps = int(args.logging_steps * t_total)
    save_steps = int(args.save_steps * t_total)
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warm_up_steps, t_total=t_total)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    max_acc = 0
    max_f1 = 0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch")
    set_seed(args)  # Added here for reproducibility (even between Python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'align_mask': batch[2],
                      'labels':         batch[4]}
            inputs['token_type_ids'] = batch[3]
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean() # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if logging_steps > 0 and global_step % logging_steps == 0:
                    if args.evaluate_during_training:
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            if key=="acc":
                                max_acc=max([max_acc,value])
                                with open(os.path.join(args.output_dir, "acc.txt"), 'a+') as w:
                                    w.write("%d\t%f\t%f\n" % (global_step, value, max_acc))
                            if key == "f1":
                                max_f1=max([max_f1,value])
                                with open(os.path.join(args.output_dir, "f1.txt"), 'a+') as w:
                                    w.write("%d\t%f\t%f\n" % (global_step, value, max_f1))
                    with open(os.path.join(args.output_dir, "loss.txt"), 'a+') as w:
                        w.write("%d\t%f\n"%(global_step, (tr_loss - logging_loss) / logging_steps))
                    logging_loss = tr_loss

                if save_steps > 0 and global_step % save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    return global_step, tr_loss / global_step
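
Unlike most of the examples here, this one takes warmup_steps, logging_steps, and save_steps as fractions of total training and converts them to absolute step counts up front. That conversion can be made explicit and guarded; a small hypothetical helper:

def to_absolute_steps(fraction, t_total):
    """Convert a fractional schedule argument in [0, 1] to an absolute step count."""
    if not 0.0 <= fraction <= 1.0:
        raise ValueError("expected a fraction in [0, 1], got %r" % fraction)
    return int(fraction * t_total)
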
Exemple #42
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    tb_writer = SummaryWriter()

    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Train batch size = %d",
                args.train_batch_size * args.gradient_accumulation_steps)
    logger.info("  Gradient accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    #steps_trained_in_current_epoch = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch",
                            position=0,
                            leave=True,
                            ncols=100)
    set_seed(args)

    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              position=0,
                              leave=True,
                              ncols=100)
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2],
                'start_positions': batch[3],
                'end_positions': batch[4],
                'cls_index': batch[5],
                'p_mask': batch[6],
                'task': 2,
            }

            outputs = model(**inputs)
            loss = outputs[0]

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            loss.backward()
            tr_loss += loss.item()

            if (step + 1) % args.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1

                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.evaluate_during_training:
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value,
                                                 global_step)
                    tb_writer.add_scalar('lr',
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(
                        model, 'module') else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args,
                               os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))
                    logging.info("Saving optimizer and scheduler states to %s",
                                 output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    tb_writer.close()

    return global_step, tr_loss / global_step
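
Because this variant saves optimizer.pt and scheduler.pt next to each checkpoint, training can later be resumed by loading them back symmetrically, as some of the earlier examples do. A hedged sketch assuming the same file layout:

import os
import torch

def load_training_state(ckpt_dir, optimizer, scheduler):
    """Restore optimizer/scheduler states saved alongside a checkpoint, if present."""
    opt_path = os.path.join(ckpt_dir, "optimizer.pt")
    sche_path = os.path.join(ckpt_dir, "scheduler.pt")
    if os.path.isfile(opt_path) and os.path.isfile(sche_path):
        optimizer.load_state_dict(torch.load(opt_path))
        scheduler.load_state_dict(torch.load(sche_path))
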
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case",
                        default=False,
                        action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16',
                        default=False,
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")

    args = parser.parse_args()

    processors = {
        "cola": ColaProcessor,
        "snli": SnliProcessor,
        "mrpc": MrpcProcessor,
        "ant": AntProcessor,
        'buy_data': BuyProcessor,
        'buy_mt': BuyMTProcessor,
        'sent_clf': SingleSentProcessor,
        'douban': DoubanProcessor,
        'keyword': KeywordProcessor,
    }

    num_labels_task = {
        "cola": 2,
        "snli": 3,
        "mrpc": 2,
        "ant": 2,
        'buy_data': 2,
        'buy_mt': 2,
        'sent_clf': 2,
        'douban': 2,
        'keyword': 2,
    }

    global device
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    # if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
    #     raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    global processor
    global label_list
    global tokenizer
    processor = processors[task_name]()
    num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = 0
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    model = BertForSequenceClassification.from_pretrained(args.bert_model,
                                                          cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(
                                                              args.local_rank),
                                                          num_labels=num_labels)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            # from apex.parallel import DistributedDataParallel as DDP
            from torch.nn.parallel import DistributedDataParallel as DDP  # use torch's DDP instead of NVIDIA's apex package
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        model = DDP(model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    t_total = num_train_steps
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)

    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total)

    global global_step
    global_step = 0
    if args.do_train:
        train_features = convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        epoch_idx = 0
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            epoch_idx += 1
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            start = time.time()
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                # if step % 10000 == 0:
                # 	print('step {} | loss {} | spend {} s'.format(step, loss, time.time() - start))
                # 	start = time.time()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify learning rate with special warm up BERT uses
                    lr_this_step = args.learning_rate * warmup_linear(global_step / t_total, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

            # Save a trained model
            model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
            output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME + '_epoch{}'.format(epoch_idx))
            torch.save(model_to_save.state_dict(), output_model_file)
            output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
            with open(output_config_file, 'w') as f:
                f.write(model_to_save.config.to_json_string())

            if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
                eval(model, args, epoch_idx, tr_loss / nb_tr_steps)
    else:
        # Load a trained model that you have fine-tuned
        if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
            eval(model, args, -10000, -10000)
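
The manual learning-rate update inside the training loop relies on warmup_linear from the old pytorch_pretrained_bert optimization module, driven by the training progress x = global_step / t_total. Its shape is approximately the following (a sketch for intuition, not the library source):

def warmup_linear(x, warmup=0.002):
    """Approximate old-style schedule: linear ramp during the warmup
    fraction, then linear decay 1 - x over the rest of training."""
    if x < warmup:
        return x / warmup
    return 1.0 - x
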
Exemple #44
def train_model(rank, world_size, args):
    """ 모델 학습 """
    if 1 < args.n_gpu:
        init_process_group(rank, world_size)
    master = (world_size == 0 or rank % world_size == 0)
    if master and args.wandb:
        wandb.init(project=args.project)

    vocab = load_vocab(args.vocab)

    config = Config.load(args.config)
    config.n_enc_vocab = len(vocab)
    config.device = f"cuda:{rank}" if torch.cuda.is_available() else "cpu"
    print(config)

    best_epoch, best_loss, best_score = 0, 0, 0
    train_model = ALBertTrainMovie(config)
    if os.path.isfile(args.save):
        try:
            best_epoch, best_loss, best_score = train_model.load(args.save)
            print(
                f"rank: {rank} load state dict from: {os.path.basename(args.save)}"
            )
        except Exception:
            print(f'load {os.path.basename(args.save)} failed.')
    elif os.path.isfile(args.pretrain_save):
        try:
            epoch, loss = train_model.bert.load(args.pretrain_save)
            print(
                f"rank: {rank} load pretrain from: {os.path.basename(args.pretrain_save)}, epoch={epoch}, loss={loss}"
            )
        except Exception:
            print(f'load {os.path.basename(args.pretrain_save)} failed.')

    if 1 < args.n_gpu:
        train_model.to(config.device)
        # noinspection PyArgumentList
        train_model = DistributedDataParallel(train_model,
                                              device_ids=[rank],
                                              find_unused_parameters=True)
    else:
        train_model.to(config.device)

    if master and args.wandb:
        wandb.watch(train_model)

    criterion_cls = torch.nn.CrossEntropyLoss()

    train_loader: DataLoader = data.build_data_loader(vocab,
                                                      args.train,
                                                      args,
                                                      data_type='train',
                                                      shuffle=True)
    test_loader: DataLoader = data.build_data_loader(vocab,
                                                     args.test,
                                                     args,
                                                     data_type='test',
                                                     shuffle=False)

    t_total = len(train_loader) * args.epoch
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in train_model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in train_model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = optim.AdamW(optimizer_grouped_parameters,
                            lr=config.learning_rate,
                            eps=config.adam_epsilon)
    scheduler = optim.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=config.warmup_steps,
        num_training_steps=t_total)

    start_epoch = best_epoch + 1
    with trange(args.epoch, desc="Epoch", position=0) as pbar:
        pbar.set_postfix_str(
            f"best epoch: {best_epoch}, loss: {best_loss:.4f}, accuracy: {best_score:.3f}"
        )
        for step in pbar:
            epoch = step + start_epoch

            loss = train_epoch(config, rank, train_model, criterion_cls,
                               optimizer, scheduler, train_loader)
            score = eval_epoch(config, rank, train_model, test_loader)
            if master and args.wandb:
                wandb.log({"loss": loss, "accuracy": score})

            if master and best_score < score:
                best_epoch, best_loss, best_score = epoch, loss, score
                if isinstance(train_model, DistributedDataParallel):
                    train_model.module.save(best_epoch, best_loss, best_score,
                                            args.save)
                else:
                    train_model.save(best_epoch, best_loss, best_score,
                                     args.save)

                pbar.set_postfix_str(
                    f"best epoch: {best_epoch}, loss: {best_loss:.4f}, accuracy: {best_score:.3f}"
                )

    if 1 < args.n_gpu:
        destroy_process_group()
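
Saving via train_model.module.save(...) when wrapped in DistributedDataParallel, and train_model.save(...) otherwise, is the same unwrap-before-save idiom the other examples write as model.module if hasattr(model, "module") else model. As a reusable helper (name hypothetical):

def unwrap_model(model):
    """Return the underlying module when wrapped in DataParallel/DDP, else the model itself."""
    return model.module if hasattr(model, "module") else model
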
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
        os.path.join(args.model_name_or_path, "scheduler.pt")
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")

        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True,
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 1
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        try:
            # set global_step to the global_step of the last saved checkpoint from the model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", global_step)
            logger.info(
                "  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch,
            )
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0],
    )
    # Added here for reproducibility
    set_seed(args)

    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device) for t in batch)

            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "start_positions": batch[3],
                "end_positions": batch[4],
            }

            if args.model_type in ["xlm", "roberta", "distilbert", "camembert"]:
                del inputs["token_type_ids"]

            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
                if args.version_2_with_negative:
                    inputs.update({"is_impossible": batch[7]})
                if hasattr(model, "config") and hasattr(model.config, "lang2id"):
                    inputs.update(
                        {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)}
                    )

            outputs = model(**inputs)
            # model outputs are always tuple in transformers (see doc)
            loss = outputs[0]

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel (not distributed) training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                # Log metrics
                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Only evaluate when single GPU otherwise metrics may not average well
                    if args.local_rank == -1 and args.evaluate_during_training:
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar(
                        "loss", (tr_loss - logging_loss) / args.logging_steps, global_step,
                    )
                    logging_loss = tr_loss

                # Save model checkpoint
                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    # Take care of distributed/parallel training
                    model_to_save = model.module if hasattr(model, "module") else model
                    if args.train_adapter:
                        model_to_save.save_all_adapters(output_dir)
                    else:
                        model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
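As a quick illustration of the schedule built above, `get_linear_schedule_with_warmup` applies a multiplier that rises linearly over the warmup steps and then decays linearly to zero; a minimal sketch with illustrative numbers:

# Shape of the linear warmup-then-decay multiplier (illustrative values)
def lr_lambda(current_step, num_warmup_steps=100, num_training_steps=1000):
    if current_step < num_warmup_steps:
        return current_step / max(1, num_warmup_steps)
    return max(0.0, (num_training_steps - current_step)
               / max(1, num_training_steps - num_warmup_steps))

print(lr_lambda(50))   # 0.5 -- halfway through warmup
print(lr_lambda(100))  # 1.0 -- warmup complete
print(lr_lambda(550))  # 0.5 -- halfway through the decay phase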
Exemple #46
0
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    # kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  num_workers=1,
                                  pin_memory=True)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    if args.cuda:
        # Move model to GPU.
        model.cuda()

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate * hvd.size(),
                      eps=args.adam_epsilon)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.none

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        compression=compression)

    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    print("############# Start to create model ###################")
    # multi-gpu training (should be after apex fp16 initialization)
    # if args.n_gpu > 1:
    #     print("############# DataParallel ###################")
    #     model = torch.nn.DataParallel(model)

    #     # Distributed training (should be after apex fp16 initialization)
    #     if args.local_rank != -1:
    #         print("############# DistributedDataParallel ###################")
    #         model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
    #                                                           output_device=args.local_rank,
    #                                                           find_unused_parameters=True)
    # else:
    #     print("############# Normal ###################")
    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (hvd.size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])

    set_seed(args)  # Added here for reproducibility (even between Python 2 and 3)
    for _ in train_iterator:  # iterate the trange bar created above so it actually advances

        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        step_count = 0
        aa = time.time()
        for step, batch in enumerate(epoch_iterator):  # iterate the tqdm wrapper so the bar updates
            a = time.time()
            inputs, labels = mask_tokens(batch, tokenizer,
                                         args) if args.mlm else (batch, batch)
            inputs, labels = inputs.cuda(), labels.cuda()
            # inputs = inputs.to(args.device)
            # labels = labels.to(args.device)
            model.train()
            outputs = model(inputs,
                            masked_lm_labels=labels) if args.mlm else model(
                                inputs, labels=labels)
            loss = outputs[
                0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value,
                                                 global_step)
                    tb_writer.add_scalar('lr',
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = 'checkpoint'
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir,
                        '{}-{}'.format(checkpoint_prefix, global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(
                        model, 'module'
                    ) else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)
            step_count += 1
            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
            b = time.time()
            if step_count % 100 == 20:
                print("***** Step time: ", b - a, "; Step: ", step_count,
                      "; loss: ", tr_loss / step_count, "*****")
        bb = time.time()
        print("***** Total Training time: ", bb - aa, "; Step: ", step_count,
              "; loss: ", tr_loss / step_count, "*****")
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
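The Horovod example above follows the usual init/broadcast/wrap sequence and scales the learning rate linearly with the number of workers; a minimal standalone sketch of that setup (assuming Horovod with PyTorch support is installed):

import torch
import horovod.torch as hvd

hvd.init()
model = torch.nn.Linear(10, 2)
# Linear LR scaling: with hvd.size() workers each seeing its own batch,
# the effective global batch grows with the worker count.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5 * hvd.size())
# Make every worker start from rank 0's weights and optimizer state.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)
# Wrap the optimizer so gradients are averaged across workers on step().
optimizer = hvd.DistributedOptimizer(optimizer,
                                     named_parameters=model.named_parameters())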
Exemple #47
0
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate
    )

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Check if saved optimizer or scheduler states exist
    if (
        args.model_name_or_path
        and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt"))
        and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt"))
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # set global_step to the global_step of the last saved checkpoint from the model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", global_step)
            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    model_to_resize = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
    model_to_resize.resize_token_embeddings(len(tokenizer))

    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
    )
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                print(f"Step: {step}, Loss: {loss.item()}")

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                        args.local_rank == -1 and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
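The `mask_tokens` helper used above implements BERT-style masked-language-model corruption; a minimal sketch of the standard 15% / 80-10-10 recipe (the function name and signature are illustrative, not the original helper):

import torch

def simple_mask_tokens(inputs, mask_token_id, vocab_size, mlm_prob=0.15):
    labels = inputs.clone()
    # Pick ~15% of positions as prediction targets
    masked = torch.bernoulli(torch.full(labels.shape, mlm_prob)).bool()
    labels[~masked] = -100  # loss is computed only on masked positions
    # 80% of the targets are replaced with [MASK]
    replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked
    inputs[replaced] = mask_token_id
    # half of the remaining targets get a random token; the rest stay intact
    randomized = (torch.bernoulli(torch.full(labels.shape, 0.5)).bool()
                  & masked & ~replaced)
    inputs[randomized] = torch.randint(vocab_size, labels.shape)[randomized]
    return inputs, labels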
Exemple #48
0
if args.local_rank in [-1, 0]:
    tb_writer = SummaryWriter(args.exp_name)

encoder.zero_grad()
model.zero_grad()

###++++++++++++++++++++++++++++++++++++++++++
total_batch_num = len(train_dataloader)
logger.info('Total number of batches = {}'.format(total_batch_num))
eval_batch_interval_num = int(total_batch_num * args.eval_interval_ratio) + 1
logger.info(
    'Evaluate the model by = {} batches'.format(eval_batch_interval_num))
###++++++++++++++++++++++++++++++++++++++++++

train_iterator = trange(start_epoch,
                        start_epoch + int(args.num_train_epochs),
                        desc="Epoch",
                        disable=args.local_rank not in [-1, 0])
for epoch in train_iterator:
    epoch_iterator = tqdm(train_dataloader,
                          desc="Iteration",
                          disable=args.local_rank not in [-1, 0])
    for step, batch in enumerate(epoch_iterator):
        encoder.train()
        model.train()
        #++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
        for key, value in batch.items():
            if key not in {'ids'}:
                batch[key] = value.to(args.device)
        #++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
        inputs = {
            'input_ids':
Exemple #49
0
import numpy as np
from sklearn.datasets import fetch_openml
from tqdm import trange

# NOTE: this snippet begins mid-function in the source; the `def` header and
# the two reorder-index lines below are assumed (the standard helper that
# sorts the MNIST train/test splits by target label).
def sort_by_target(mnist):
    reorder_train = np.array(sorted([(target, i) for i, target in enumerate(mnist.target[:60000])]))[:, 1]
    reorder_test = np.array(sorted([(target, i) for i, target in enumerate(mnist.target[60000:])]))[:, 1]
    mnist.data[:60000] = mnist.data[reorder_train]
    mnist.target[:60000] = mnist.target[reorder_train]
    mnist.data[60000:] = mnist.data[reorder_test + 60000]
    mnist.target[60000:] = mnist.target[reorder_test + 60000]

# Get MNIST data, normalize, and divide by level
# mnist = fetch_openml('MNIST original', data_home='./data')
mnist = fetch_openml('mnist_784', version=1, cache=True)
mnist.target = mnist.target.astype(np.int8)  # fetch_openml() returns targets as strings
sort_by_target(mnist)  # fetch_openml() returns an unsorted dataset

mu = np.mean(mnist.data.astype(np.float32), 0)
sigma = np.std(mnist.data.astype(np.float32), 0)
mnist.data = (mnist.data.astype(np.float32) - mu)/(sigma+0.001)
mnist_data = []
for i in trange(10):
    idx = mnist.target==i
    mnist_data.append(mnist.data[idx])

print([len(v) for v in mnist_data])

###### CREATE USER DATA SPLIT #######
# Assign 10 samples to each user
X = [[] for _ in range(1000)]
y = [[] for _ in range(1000)]
idx = np.zeros(10, dtype=np.int64)
for user in range(1000):
    for j in range(2):
        l = (user+j)%10
        X[user] += mnist_data[l][idx[l]:idx[l]+5].tolist()
        y[user] += (l*np.ones(5)).tolist()
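A quick check of the split logic above: user u receives 5 samples from class u % 10 and 5 from class (u + 1) % 10, i.e. exactly 10 samples drawn from 2 classes per user:

for user in (0, 1, 999):
    classes = [(user + j) % 10 for j in range(2)]
    print(user, classes)  # 0 -> [0, 1], 1 -> [1, 2], 999 -> [9, 0]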
Exemple #50
0
def train(args, train_dataset, model, tokenizer, criterion):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset,
        sampler=train_sampler,
        batch_size=args.train_batch_size,
        collate_fn=collate_fn,
        num_workers=args.num_workers,
    )

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    best_f1, n_no_improve = 0, 0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            labels = batch[5]
            inputs = {
                "input_ids": batch[0],
                "input_modal": batch[2],
                "attention_mask": batch[1],
                "modal_start_tokens": batch[3],
                "modal_end_tokens": batch[4],
            }
            outputs = model(**inputs)
            logits = outputs[0]  # model outputs are always tuple in transformers (see doc)
            loss = criterion(logits, labels)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    logs = {}
                    if (
                        args.local_rank == -1 and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer, criterion)
                        for key, value in results.items():
                            eval_key = "eval_{}".format(key)
                            logs[eval_key] = value

                    loss_scalar = (tr_loss - logging_loss) / args.logging_steps
                    learning_rate_scalar = scheduler.get_lr()[0]
                    logs["learning_rate"] = learning_rate_scalar
                    logs["loss"] = loss_scalar
                    logging_loss = tr_loss

                    for key, value in logs.items():
                        tb_writer.add_scalar(key, value, global_step)
                    print(json.dumps({**logs, **{"step": global_step}}))

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    torch.save(model_to_save.state_dict(), os.path.join(output_dir, WEIGHTS_NAME))
                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

        if args.local_rank == -1:
            results = evaluate(args, model, tokenizer, criterion)
            if results["micro_f1"] > best_f1:
                best_f1 = results["micro_f1"]
                n_no_improve = 0
            else:
                n_no_improve += 1

            if n_no_improve > args.patience:
                train_iterator.close()
                break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
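The `n_no_improve`/`patience` logic above is plain early stopping; a minimal sketch of the same rule in isolation:

def should_stop(history, patience):
    # Stop once the metric fails to improve for more than `patience` epochs in a row
    best, n_no_improve = float('-inf'), 0
    for score in history:
        if score > best:
            best, n_no_improve = score, 0
        else:
            n_no_improve += 1
        if n_no_improve > patience:
            return True
    return False

print(should_stop([0.10, 0.20, 0.19, 0.18, 0.17], patience=2))  # True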
Exemple #51
0
def main(params):
    model_output_path = params["output_path"]
    if not os.path.exists(model_output_path):
        os.makedirs(model_output_path)
    logger = utils.get_logger(params["output_path"])

    # Init model
    reranker = CrossEncoderRanker(params)
    tokenizer = reranker.tokenizer
    model = reranker.model

    # utils.save_model(model, tokenizer, model_output_path)

    device = reranker.device
    n_gpu = reranker.n_gpu

    if params["gradient_accumulation_steps"] < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(params["gradient_accumulation_steps"]))

    # An effective batch size of `x`, when accumulating gradients across `y` batches, is achieved with a per-batch size of `z = x / y`
    # args.gradient_accumulation_steps = args.gradient_accumulation_steps // n_gpu
    params["train_batch_size"] = (params["train_batch_size"] //
                                  params["gradient_accumulation_steps"])
    train_batch_size = params["train_batch_size"]
    eval_batch_size = params["eval_batch_size"]
    grad_acc_steps = params["gradient_accumulation_steps"]

    # Fix the random seeds
    seed = params["seed"]
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if reranker.n_gpu > 0:
        torch.cuda.manual_seed_all(seed)

    max_seq_length = params["max_seq_length"]
    context_length = params["max_context_length"]

    fname = os.path.join(params["data_path"], "train.t7")
    train_data = torch.load(fname)
    context_input = train_data["context_vecs"]
    candidate_input = train_data["cand_vecs"]
    label_input = train_data["labels"]
    if params["debug"]:
        max_n = 200
        context_input = context_input[:max_n]
        candidate_input = candidate_input[:max_n]
        label_input = label_input[:max_n]

    context_input = modify(context_input, candidate_input, max_seq_length)

    train_tensor_data = TensorDataset(context_input, label_input)
    train_sampler = RandomSampler(train_tensor_data)

    train_dataloader = DataLoader(train_tensor_data,
                                  sampler=train_sampler,
                                  batch_size=params["train_batch_size"])

    max_n = 2048
    if params["debug"]:
        max_n = 200
    fname = os.path.join(params["data_path"], "valid.t7")
    valid_data = torch.load(fname)
    context_input = valid_data["context_vecs"][:max_n]
    candidate_input = valid_data["cand_vecs"][:max_n]
    label_input = valid_data["labels"][:max_n]

    context_input = modify(context_input, candidate_input, max_seq_length)

    valid_tensor_data = TensorDataset(context_input, label_input)
    valid_sampler = SequentialSampler(valid_tensor_data)

    valid_dataloader = DataLoader(valid_tensor_data,
                                  sampler=valid_sampler,
                                  batch_size=params["eval_batch_size"])

    # evaluate before training
    results = evaluate(
        reranker,
        valid_dataloader,
        device=device,
        logger=logger,
        context_length=context_length,
        silent=params["silent"],
    )

    number_of_samples_per_dataset = {}

    time_start = time.time()

    utils.write_to_file(os.path.join(model_output_path, "training_params.txt"),
                        str(params))

    logger.info("Starting training")
    logger.info("device: {} n_gpu: {}, distributed training: {}".format(
        device, n_gpu, False))

    optimizer = get_optimizer(model, params)
    scheduler = get_scheduler(params, optimizer, len(train_tensor_data),
                              logger)

    model.train()

    best_epoch_idx = -1
    best_score = -1

    num_train_epochs = params["num_train_epochs"]

    for epoch_idx in trange(int(num_train_epochs), desc="Epoch"):
        tr_loss = 0
        results = None

        if params["silent"]:
            iter_ = train_dataloader
        else:
            iter_ = tqdm(train_dataloader, desc="Batch")

        part = 0
        for step, batch in enumerate(iter_):
            batch = tuple(t.to(device) for t in batch)
            context_input, label_input = batch
            loss, _ = reranker(context_input, label_input, context_length)

            # if n_gpu > 1:
            #     loss = loss.mean() # mean() to average on multi-gpu.

            if grad_acc_steps > 1:
                loss = loss / grad_acc_steps

            tr_loss += loss.item()

            if (step + 1) % (params["print_interval"] * grad_acc_steps) == 0:
                logger.info("Step {} - epoch {} average loss: {}\n".format(
                    step,
                    epoch_idx,
                    tr_loss / (params["print_interval"] * grad_acc_steps),
                ))
                tr_loss = 0

            loss.backward()

            if (step + 1) % grad_acc_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               params["max_grad_norm"])
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

            if (step + 1) % (params["eval_interval"] * grad_acc_steps) == 0:
                logger.info("Evaluation on the development dataset")
                evaluate(
                    reranker,
                    valid_dataloader,
                    device=device,
                    logger=logger,
                    context_length=context_length,
                    silent=params["silent"],
                )
                logger.info("***** Saving fine - tuned model *****")
                epoch_output_folder_path = os.path.join(
                    model_output_path, "epoch_{}_{}".format(epoch_idx, part))
                part += 1
                utils.save_model(model, tokenizer, epoch_output_folder_path)
                model.train()
                logger.info("\n")

        logger.info("***** Saving fine - tuned model *****")
        epoch_output_folder_path = os.path.join(model_output_path,
                                                "epoch_{}".format(epoch_idx))
        utils.save_model(model, tokenizer, epoch_output_folder_path)
        # reranker.save(epoch_output_folder_path)

        output_eval_file = os.path.join(epoch_output_folder_path,
                                        "eval_results.txt")
        results = evaluate(
            reranker,
            valid_dataloader,
            device=device,
            logger=logger,
            context_length=context_length,
            silent=params["silent"],
        )

        ls = [best_score, results["normalized_accuracy"]]
        li = [best_epoch_idx, epoch_idx]

        best_score = ls[np.argmax(ls)]
        best_epoch_idx = li[np.argmax(ls)]
        logger.info("\n")

    execution_time = (time.time() - time_start) / 60
    utils.write_to_file(
        os.path.join(model_output_path, "training_time.txt"),
        "The training took {} minutes\n".format(execution_time),
    )
    logger.info("The training took {} minutes\n".format(execution_time))

    # save the best model in the parent_dir
    logger.info("Best performance in epoch: {}".format(best_epoch_idx))
    params["path_to_model"] = os.path.join(model_output_path,
                                           "epoch_{}".format(best_epoch_idx))
Exemple #52
0
import numpy as np
from skimage.io import imread  # imread moved out of skimage.data in newer scikit-image
import matplotlib.pyplot as plt
from skimage import io
from tqdm import trange  # needed for the progress loop below

# Get the list of input files
from os import listdir

directory = "realpages/"

file_list = listdir(directory)

save_directory = "croppages/"

# Loop over all files

for i in trange(len(file_list)):
#for i in range(0, 10):
    #Read data
    image_file = directory + file_list[i]
    im = imread(image_file)
    #plt.figure(figsize=(30, 20))
    #plt.imshow(im, cmap='gray')
    #Get shape
    
    shape = im.shape
    #print(shape)
    
    crop_im = im[17:shape[0], 10:shape[1] - 17]
    
    #plt.figure(figsize=(30, 20))
    #plt.imshow(crop_im, cmap='gray')
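The snippet defines `save_directory` but is cut off before any write; a plausible completion (an assumption, not present in the source) would store each crop with scikit-image:

    # Assumed final step: write the cropped page into save_directory
    io.imsave(save_directory + file_list[i], crop_im)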
Exemple #53
0
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument("--src_file",
                        default=None,
                        type=str,
                        help="The input data file name.")
    parser.add_argument("--tgt_file",
                        default=None,
                        type=str,
                        help="The output data file name.")
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument("--config_path",
                        default=None,
                        type=str,
                        help="Bert config file path.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument(
        "--log_dir",
        default='',
        type=str,
        required=True,
        help="The output directory where the log will be written.")
    parser.add_argument("--model_recover_path",
                        default=None,
                        type=str,
                        required=True,
                        help="The file of fine-tuned pretraining model.")
    parser.add_argument("--optim_recover_path",
                        default=None,
                        type=str,
                        help="The file of pretraining optimizer.")

    # Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=64,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--label_smoothing",
                        default=0,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.01,
                        type=float,
                        help="The weight decay rate for Adam.")
    parser.add_argument("--finetune_decay",
                        action='store_true',
                        help="Weight decay to the original weights.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--hidden_dropout_prob",
                        default=0.1,
                        type=float,
                        help="Dropout rate for hidden states.")
    parser.add_argument("--attention_probs_dropout_prob",
                        default=0.1,
                        type=float,
                        help="Dropout rate for attention probabilities.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--fp32_embedding',
        action='store_true',
        help=
        "Whether to use 32-bit float precision instead of 16-bit for embeddings"
    )
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--amp',
                        action='store_true',
                        help="Whether to use amp for fp16")
    parser.add_argument(
        '--from_scratch',
        action='store_true',
        help=
        "Initialize parameters with random values (i.e., training from scratch)."
    )
    parser.add_argument('--new_segment_ids',
                        action='store_true',
                        help="Use new segment ids for bi-uni-directional LM.")
    parser.add_argument('--new_pos_ids',
                        action='store_true',
                        help="Use new position ids for LMs.")
    parser.add_argument('--tokenized_input',
                        action='store_true',
                        help="Whether the input is tokenized.")
    parser.add_argument('--max_len_a',
                        type=int,
                        default=0,
                        help="Truncate_config: maximum length of segment A.")
    parser.add_argument('--max_len_b',
                        type=int,
                        default=0,
                        help="Truncate_config: maximum length of segment B.")
    parser.add_argument(
        '--trunc_seg',
        default='',
        help="Truncate_config: first truncate segment A/B (option: a, b).")
    parser.add_argument(
        '--always_truncate_tail',
        action='store_true',
        help="Truncate_config: Whether we should always truncate tail.")
    parser.add_argument(
        "--mask_prob",
        default=0.15,
        type=float,
        help=
        "Number of prediction is sometimes less than max_pred when sequence is short."
    )
    parser.add_argument(
        "--mask_prob_eos",
        default=0,
        type=float,
        help=
        "Number of prediction is sometimes less than max_pred when sequence is short."
    )
    parser.add_argument('--max_pred',
                        type=int,
                        default=20,
                        help="Max tokens of prediction.")
    parser.add_argument("--num_workers",
                        default=0,
                        type=int,
                        help="Number of workers for the data loader.")

    parser.add_argument('--mask_source_words',
                        action='store_true',
                        help="Whether to mask source words for training")
    parser.add_argument('--skipgram_prb',
                        type=float,
                        default=0.0,
                        help='prob of ngram mask')
    parser.add_argument('--skipgram_size',
                        type=int,
                        default=1,
                        help='the max size of ngram mask')
    parser.add_argument('--mask_whole_word',
                        action='store_true',
                        help="Whether masking a whole word.")
    parser.add_argument('--do_l2r_training',
                        action='store_true',
                        help="Whether to do left to right training")
    parser.add_argument(
        '--has_sentence_oracle',
        action='store_true',
        help="Whether to have sentence level oracle for training. "
        "Only useful for summary generation")
    parser.add_argument('--max_position_embeddings',
                        type=int,
                        default=None,
                        help="max position embeddings")
    parser.add_argument('--relax_projection',
                        action='store_true',
                        help="Use different projection layers for tasks.")
    parser.add_argument('--ffn_type',
                        default=0,
                        type=int,
                        help="0: default mlp; 1: W((Wx+b) elem_prod x);")
    parser.add_argument('--num_qkv',
                        default=0,
                        type=int,
                        help="Number of different <Q,K,V>.")
    parser.add_argument('--seg_emb',
                        action='store_true',
                        help="Using segment embedding for self-attention.")
    parser.add_argument(
        '--s2s_special_token',
        action='store_true',
        help="New special tokens ([S2S_SEP]/[S2S_CLS]) of S2S.")
    parser.add_argument('--s2s_add_segment',
                        action='store_true',
                        help="Additional segmental for the encoder of S2S.")
    parser.add_argument(
        '--s2s_share_segment',
        action='store_true',
        help=
        "Sharing segment embeddings for the encoder of S2S (used with --s2s_add_segment)."
    )
    parser.add_argument('--pos_shift',
                        action='store_true',
                        help="Using position shift for fine-tuning.")
    parser.add_argument(
        "--experiment",
        type=str,
        default="full",
        help=
        "1.full (title + full abstract) 2.title (only title), 3.title-l1 (title + l1), 4. single 5. segsep"
    )

    args = parser.parse_args()

    assert Path(
        args.model_recover_path).exists(), "--model_recover_path doesn't exist"

    args.output_dir = args.output_dir.replace('[PT_OUTPUT_DIR]',
                                              os.getenv('PT_OUTPUT_DIR', ''))
    args.log_dir = args.log_dir.replace('[PT_OUTPUT_DIR]',
                                        os.getenv('PT_OUTPUT_DIR', ''))

    os.makedirs(args.output_dir, exist_ok=True)
    os.makedirs(args.log_dir, exist_ok=True)
    json.dump(args.__dict__,
              open(os.path.join(args.output_dir, 'opt.json'), 'w'),
              sort_keys=True,
              indent=2)

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        dist.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if args.local_rank not in (-1, 0):
        # Make sure only the first process in distributed training will download model & vocab
        dist.barrier()
    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)
    if args.max_position_embeddings:
        tokenizer.max_len = args.max_position_embeddings
    data_tokenizer = WhitespaceTokenizer(
    ) if args.tokenized_input else tokenizer
    if args.local_rank == 0:
        dist.barrier()

    DatasetFunc = ConcatDataset
    processor = seq2seq_loader.Preprocess4Seq2seq
    if args.experiment == "title":
        DatasetFunc = TitleDataset
    elif args.experiment == "title-l1":
        DatasetFunc = TitleLead1Dataset
    elif args.experiment == "single":
        DatasetFunc = SingleTrainingDataset
    elif args.experiment == "title-first":
        DatasetFunc = TitleFirstDataset
    elif args.experiment == "segsep":
        DatasetFunc = SegSepDataset
        processor = Preprocess4SegSep

    if args.do_train:
        print("Loading Train Dataset", args.data_dir)
        bi_uni_pipeline = [
            processor(args.max_pred,
                      args.mask_prob,
                      list(tokenizer.vocab.keys()),
                      tokenizer.convert_tokens_to_ids,
                      args.max_seq_length,
                      new_segment_ids=args.new_segment_ids,
                      truncate_config={
                          'max_len_a': args.max_len_a,
                          'max_len_b': args.max_len_b,
                          'trunc_seg': args.trunc_seg,
                          'always_truncate_tail': args.always_truncate_tail
                      },
                      mask_source_words=args.mask_source_words,
                      skipgram_prb=args.skipgram_prb,
                      skipgram_size=args.skipgram_size,
                      mask_whole_word=args.mask_whole_word,
                      mode="s2s",
                      has_oracle=args.has_sentence_oracle,
                      num_qkv=args.num_qkv,
                      s2s_special_token=args.s2s_special_token,
                      s2s_add_segment=args.s2s_add_segment,
                      s2s_share_segment=args.s2s_share_segment,
                      pos_shift=args.pos_shift)
        ]
        file_oracle = None
        if args.has_sentence_oracle:
            file_oracle = os.path.join(args.data_dir, 'train.oracle')
        fn_src = os.path.join(args.data_dir,
                              args.src_file if args.src_file else 'train.src')
        fn_tgt = os.path.join(args.data_dir,
                              args.tgt_file if args.tgt_file else 'train.tgt')
        train_dataset = DatasetFunc(fn_src,
                                    fn_tgt,
                                    args.train_batch_size,
                                    data_tokenizer,
                                    args.max_seq_length,
                                    args.max_len_b,
                                    bi_uni_pipeline=bi_uni_pipeline)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset, replacement=False)
            _batch_size = args.train_batch_size
        else:
            train_sampler = DistributedSampler(train_dataset)
            _batch_size = args.train_batch_size // dist.get_world_size()
        train_dataloader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=_batch_size,
            sampler=train_sampler,
            num_workers=args.num_workers,
            collate_fn=seq2seq_loader.batch_list_to_batch_tensors,
            pin_memory=False)

    # note: args.train_batch_size has already been divided by args.gradient_accumulation_steps
    # t_total = int(math.ceil(len(train_dataset.ex_list) / args.train_batch_size)
    t_total = int(
        len(train_dataloader) * args.num_train_epochs /
        args.gradient_accumulation_steps) if args.do_train else -1

    amp_handle = None
    if args.fp16 and args.amp:
        from apex import amp
        amp_handle = amp.init(enable_caching=True)
        logger.info("enable fp16 with amp")

    # Prepare model
    recover_step = _get_max_epoch_model(args.output_dir)
    cls_num_labels = 2
    type_vocab_size = 6 + \
        (1 if args.s2s_add_segment else 0) if args.new_segment_ids else 2

    if args.experiment == "segsep":
        type_vocab_size = 11  # the largest dataset has only 10 papers

    num_sentlvl_labels = 2 if args.has_sentence_oracle else 0
    relax_projection = 4 if args.relax_projection else 0
    if args.local_rank not in (-1, 0):
        # Make sure only the first process in distributed training will load the model
        dist.barrier()
    if (recover_step is None) and (args.model_recover_path is None):
        # if _state_dict == {}, the parameters are randomly initialized
        # if _state_dict == None, the parameters are initialized with bert-init
        _state_dict = {} if args.from_scratch else None
        model = BertForPreTrainingLossMask.from_pretrained(
            args.bert_model,
            state_dict=_state_dict,
            num_labels=cls_num_labels,
            num_rel=0,
            type_vocab_size=type_vocab_size,
            config_path=args.config_path,
            task_idx=3,
            num_sentlvl_labels=num_sentlvl_labels,
            max_position_embeddings=args.max_position_embeddings,
            label_smoothing=args.label_smoothing,
            fp32_embedding=args.fp32_embedding,
            relax_projection=relax_projection,
            new_pos_ids=args.new_pos_ids,
            ffn_type=args.ffn_type,
            hidden_dropout_prob=args.hidden_dropout_prob,
            attention_probs_dropout_prob=args.attention_probs_dropout_prob,
            num_qkv=args.num_qkv,
            seg_emb=args.seg_emb)
        global_step = 0
    else:
        if recover_step:
            logger.info("***** Recover model: %d *****", recover_step)
            model_recover = torch.load(os.path.join(
                args.output_dir, "model.{0}.bin".format(recover_step)),
                                       map_location='cpu')
            # recover_step == number of epochs
            global_step = math.floor(recover_step * t_total /
                                     args.num_train_epochs)
        elif args.model_recover_path:
            logger.info("***** Recover model: %s *****",
                        args.model_recover_path)
            model_recover = torch.load(args.model_recover_path,
                                       map_location='cpu')
            global_step = 0
        model = BertForPreTrainingLossMask.from_pretrained(
            args.bert_model,
            state_dict=model_recover,
            num_labels=cls_num_labels,
            num_rel=0,
            type_vocab_size=type_vocab_size,
            config_path=args.config_path,
            task_idx=3,
            num_sentlvl_labels=num_sentlvl_labels,
            max_position_embeddings=args.max_position_embeddings,
            label_smoothing=args.label_smoothing,
            fp32_embedding=args.fp32_embedding,
            relax_projection=relax_projection,
            new_pos_ids=args.new_pos_ids,
            ffn_type=args.ffn_type,
            hidden_dropout_prob=args.hidden_dropout_prob,
            attention_probs_dropout_prob=args.attention_probs_dropout_prob,
            num_qkv=args.num_qkv,
            seg_emb=args.seg_emb)
    if args.local_rank == 0:
        dist.barrier()

    if args.fp16:
        model.half()
        if args.fp32_embedding:
            model.bert.embeddings.word_embeddings.float()
            model.bert.embeddings.position_embeddings.float()
            model.bert.embeddings.token_type_embeddings.float()
    model.to(device)
    if args.local_rank != -1:
        try:
            from torch.nn.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError("DistributedDataParallel")
        model = DDP(model,
                    device_ids=[args.local_rank],
                    output_device=args.local_rank,
                    find_unused_parameters=True)
    elif n_gpu > 1:
        # model = torch.nn.DataParallel(model)
        model = DataParallelImbalance(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    if args.fp16:
        try:
            # from apex.optimizers import FP16_Optimizer
            from pytorch_pretrained_bert.optimization_fp16 import FP16_Optimizer_State
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer_State(optimizer,
                                             dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer_State(optimizer,
                                             static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total)

    if recover_step:
        logger.info("***** Recover optimizer: %d *****", recover_step)
        optim_recover = torch.load(os.path.join(
            args.output_dir, "optim.{0}.bin".format(recover_step)),
                                   map_location='cpu')
        if hasattr(optim_recover, 'state_dict'):
            optim_recover = optim_recover.state_dict()
        optimizer.load_state_dict(optim_recover)
        if args.loss_scale == 0:
            logger.info("***** Recover optimizer: dynamic_loss_scale *****")
            optimizer.dynamic_loss_scale = True

    logger.info("***** CUDA.empty_cache() *****")
    torch.cuda.empty_cache()

    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", t_total)

        model.train()
        if recover_step:
            start_epoch = recover_step + 1
        else:
            start_epoch = 1
        for i_epoch in trange(start_epoch,
                              int(args.num_train_epochs) + 1,
                              desc="Epoch",
                              disable=args.local_rank not in (-1, 0)):
            if args.local_rank != -1:
                train_sampler.set_epoch(i_epoch)
            iter_bar = tqdm(train_dataloader,
                            desc='Iter (loss=X.XXX)',
                            disable=args.local_rank not in (-1, 0))
            for step, batch in enumerate(iter_bar):
                batch = [
                    t.to(device) if t is not None else None for t in batch
                ]
                if args.has_sentence_oracle:
                    input_ids, segment_ids, input_mask, mask_qkv, lm_label_ids, masked_pos, masked_weights, is_next, task_idx, oracle_pos, oracle_weights, oracle_labels = batch
                else:
                    input_ids, segment_ids, input_mask, mask_qkv, lm_label_ids, masked_pos, masked_weights, is_next, task_idx = batch
                    oracle_pos, oracle_weights, oracle_labels = None, None, None
                loss_tuple = model(input_ids,
                                   segment_ids,
                                   input_mask,
                                   lm_label_ids,
                                   is_next,
                                   masked_pos=masked_pos,
                                   masked_weights=masked_weights,
                                   task_idx=task_idx,
                                   masked_pos_2=oracle_pos,
                                   masked_weights_2=oracle_weights,
                                   masked_labels_2=oracle_labels,
                                   mask_qkv=mask_qkv)
                masked_lm_loss, next_sentence_loss = loss_tuple
                if n_gpu > 1:  # mean() to average on multi-gpu.
                    # loss = loss.mean()
                    masked_lm_loss = masked_lm_loss.mean()
                    next_sentence_loss = next_sentence_loss.mean()
                loss = masked_lm_loss + next_sentence_loss

                # logging for each step (i.e., before normalization by args.gradient_accumulation_steps)
                iter_bar.set_description('Iter (loss=%5.3f)' % loss.item())

                # ensure that accumulated gradients are normalized
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                    if amp_handle:
                        amp_handle._clear_cache()
                else:
                    loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    lr_this_step = args.learning_rate * \
                        warmup_linear(global_step/t_total,
                                      args.warmup_proportion)
                    if args.fp16:
                        # modify learning rate with the special warm-up BERT uses
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

            # Save a trained model
            if (args.local_rank == -1 or torch.distributed.get_rank() == 0):
                logger.info(
                    "** ** * Saving fine-tuned model and optimizer ** ** * ")
                model_to_save = model.module if hasattr(
                    model, 'module') else model  # Only save the model itself
                output_model_file = os.path.join(
                    args.output_dir, "model.{0}.bin".format(i_epoch))
                torch.save(model_to_save.state_dict(), output_model_file)
                output_optim_file = os.path.join(
                    args.output_dir, "optim.{0}.bin".format(i_epoch))
                torch.save(optimizer.state_dict(), output_optim_file)

                logger.info("***** CUDA.empty_cache() *****")
                torch.cuda.empty_cache()
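The core progress-reporting pattern in the training loop above is an outer trange over epochs feeding an inner tqdm over the dataloader, whose description is rewritten each step to show the live loss. A minimal, self-contained sketch of just that pattern, with a random number standing in for the real training loss:

import random
from tqdm import tqdm, trange

for i_epoch in trange(1, 4, desc="Epoch"):
    iter_bar = tqdm(range(100), desc='Iter (loss=X.XXX)', leave=False)
    for step in iter_bar:
        loss = random.random()  # stand-in for the model's training loss
        iter_bar.set_description('Iter (loss=%5.3f)' % loss)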
Exemple #54
0
    inp1 = torch.randn(num_workers, 3, resolution[0], resolution[1]).cuda()
    depth1 = torch.randn(num_workers, 1, resolution[0], resolution[1]).cuda()
    a_icm = torch.zeros(num_workers).long().cuda()

    for epoch in range(epochs):
        print("\nEpoch %d\n-------" % (epoch))
        loss_value_total = 0.0
        loss_policy_total = 0.0
        loss_entropy_total = 0.0
        loss_inverse_total = 0.0
        loss_forward_total = 0.0
        reward_intrinsic_total = 0.0

        print("Training...")
        model.train()
        for learning_step in trange(sequences_per_epoch, leave=False):
            loss = 0.0
            probs_list = []
            log_probs_list = []
            entropy_list = []
            value_list = []
            reward_list = []
            unfinished_list = []
            forward_start_time = time()
            for t in range(seq_len):
                inp, depth = prep_frames_batch(workers)
                (policy, value, hidden) = model(inp, hidden)
                probs = F.softmax(policy, 1)
                log_probs = F.log_softmax(policy, 1)
                a = probs.multinomial(num_samples=1).detach().squeeze(1)
                probs_list.append(probs[whole_batch, a])
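The rollout loop above passes leave=False to trange so each epoch's inner bar is erased once the sequence finishes and only the per-epoch prints persist. A reduced sketch of that behavior, with time.sleep standing in for the forward/backward work:

import time
from tqdm import trange

for epoch in range(2):
    print("\nEpoch %d\n-------" % epoch)
    for learning_step in trange(20, leave=False):
        time.sleep(0.05)  # stand-in for prep_frames_batch + the model forward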
Exemple #55
0
validation_data = TensorDataset(validation_inputs, validation_masks,
                                validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data,
                                   sampler=validation_sampler,
                                   batch_size=batch_size)

device = torch.device("cuda:0")  #if torch.cuda.is_available() else "cpu")

train_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 4

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):

    # Training

    # Set our model to training mode (as opposed to evaluation mode)
    model.train()

    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    # Train the data for one epoch
    for step, batch in enumerate(train_dataloader):
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
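A self-contained sketch of the epoch loop this snippet is building toward: trange(epochs, desc="Epoch") around a DataLoader, with the usual tracking variables. The tensors, model, and optimizer here are synthetic stand-ins for the real BERT setup:

import torch
from tqdm import trange
from torch.utils.data import DataLoader, TensorDataset

inputs = torch.randn(64, 8)          # synthetic features
labels = torch.randint(0, 2, (64,))  # synthetic binary labels
train_dataloader = DataLoader(TensorDataset(inputs, labels), batch_size=16)
model = torch.nn.Linear(8, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

for _ in trange(4, desc="Epoch"):
    model.train()
    tr_loss, nb_tr_steps = 0.0, 0
    for step, (x, y) in enumerate(train_dataloader):
        optimizer.zero_grad()
        loss = torch.nn.functional.cross_entropy(model(x), y)
        loss.backward()
        optimizer.step()
        tr_loss += loss.item()
        nb_tr_steps += 1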
Exemple #56
0
    # target_image = target_image.permute(2, 0, 1).unsqueeze(0)

    optimizer = torch.optim.SGD(network.parameters(), lr=train_rate, momentum=0.5)
    # optimizer = torch.optim.Adam(network.parameters(), lr=train_rate)

    epochs = 100
    validate_every = 10

    if args.inference:
        torch.autograd.set_grad_enabled(False)
        epochs = 1
        sim_duration = 25
        sim_dt = (1.0 / 60.0) / sim_substeps
        sim_steps = int(sim_duration / sim_dt)

    for e in trange(epochs):

        sim_time = 0.0

        state = model.state()

        loss = torch.zeros(1, requires_grad=True)
        # loss = None

        print_every = 60 * 16
        render_every = 60

        imgs = []

        for i in range(0, sim_steps):
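When --inference is set, the script above disables autograd globally and collapses the trange loop to a single epoch. A minimal sketch of that switch, with a toy loss standing in for the simulated trajectory:

import torch
from tqdm import trange

inference = True  # mirrors args.inference above
epochs = 1 if inference else 100
torch.autograd.set_grad_enabled(not inference)

x = torch.randn(3, requires_grad=not inference)
for e in trange(epochs):
    loss = (x * x).sum()  # stand-in for the simulation rollout loss
    if not inference:
        loss.backward()
        x.grad = None  # clear between epochs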
Exemple #57
0
def start(worker, size, quality, folder, temp, preffix, suffix, upscale,
          downscale, copy_ud, cores):
    curr = ""
    d = open(f"{temp}/core{worker}.log", "w")
    d.close()
    d = open(f"{temp}/core{worker}.log", "a")
    try:
        images = get_data(worker, temp)
        bar = trange(
            len(images), leave=True, dynamic_ncols=True,
            ascii=True)  #, bar_format= "{l_bar}{bar}|{n_fmt}/{total_fmt}"
        terminalsize = 100**10  # sentinel wider than any real terminal
        for x in bar:
            if os.path.isfile("./stop.all"):
                break
            try:
                curr = images[x][1]
                img = Image.open(f"{folder}{images[x][0]}{images[x][2]}")
                if not os.path.isdir(folder + images[x][1]):
                    os.makedirs(folder + images[x][1])
                if (((img.size[0] > size or img.size[1] > size) and downscale)
                        or
                    ((img.size[0] < size and img.size[1] < size) and upscale)):
                    img.resize(
                        get_pos(img.size, size), resample=Image.BICUBIC
                    ).save(
                        f"{folder}{images[x][1]}{preffix}{images[x][3]}{suffix}{images[x][4]}",
                        quality=quality,
                        optimize=True)
                elif copy_ud:
                    img.save(
                        f"{folder}{images[x][1]}{preffix}{images[x][3]}{suffix}{images[x][4]}",
                        quality=quality,
                        optimize=True)
                img = None
                fa = open(f"{temp}/core{worker}.progress", "w")
                fa.write(str(x))
                fa.close()
                cols = os.get_terminal_size()[0]
                if cols < terminalsize or bar.ncols + 1 > cols:
                    terminalsize = cols
                    bar.refresh()
                    os.system("cls")
                    if (cores == 1):
                        print("Starting...")
                        print("Applying...")
                        print("Ctrl+C to stop")
                elif cols != terminalsize:
                    terminalsize = cols
            except Exception as E:
                d.write("\n" + re.sub(
                    regex, subst,
                    f'{datetime.now().strftime("%d.%m.%Y %H:%M:%S")} [Error] > "{images[x][0]}{images[x][2]}" > {E}'
                ))
    except Exception as E:
        d = open(f"{temp}/core{worker}.log", "a")
        d.write(f"\n{datetime.now()} [Error] > {curr} > {E}")
        d.close()
        f = open(f"{temp}/core{worker}.data", "w+")
        f.close()
    d.close()
    f = open(f"{temp}/core{worker}.data", "w+")
    f.close()
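Much of the manual os.get_terminal_size() bookkeeping above exists to keep the bar intact when the console is resized. tqdm's dynamic_ncols option covers the common case by re-measuring the terminal on every refresh; a minimal sketch:

import time
from tqdm import trange

for x in trange(100, leave=True, dynamic_ncols=True, ascii=True):
    time.sleep(0.02)  # stand-in for the per-image open/resize/save work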

Exemple #58
0
# -----------
# Main
#
# This is where we will mostly be spawning workers, initializing networks, and plotting our rewards.
# -----------

if __name__ == "__main__":
    memory = Memory([], MEMORY_SIZE)
    model = QNetwork()
    epsilon = 1
    all_epsilons, all_steps, all_rewards = [], [], []
    env = gym.make("CartPole-v0")
    video_recorder = VideoRecorder(env, './output/00_Cartpole_Q_Learning_Discrete_Video.mp4', enabled=True)
    for i in trange(TOTAL_RUNTIME):
        total_reward = 0.0
        total_steps = 0
        state = env.reset()[2]
        while True:
            if (i+1) % 100 == 0:
                video_recorder.capture_frame()
            if i < STARTUP_SIZE:
                action = env.action_space.sample()
            else:
                if random.random() < epsilon:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(model(torch.from_numpy(angle_to_vector(state, N_STATES)).to(device)).cpu().detach().numpy())
            next_state, reward, done, _ = env.step(action)
            next_state = next_state[2]
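A compact, dependency-free sketch of the episode loop above, with trange tracking total runtime and epsilon-greedy action selection inside; the coin-flip environment is a dummy stand-in for gym's CartPole:

import random
from tqdm import trange

epsilon = 1.0
all_rewards = []
for i in trange(500):
    total_reward = 0.0
    for _ in range(200):  # step cap per episode
        explore = random.random() < epsilon
        action = random.randrange(2) if explore else 0  # 0 = greedy placeholder
        total_reward += 1.0  # stand-in for the env.step(...) reward
    epsilon = max(0.05, epsilon * 0.995)  # anneal exploration
    all_rewards.append(total_reward)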
Exemple #59
0
    def train(self):
        x_list, xs, ys, sample_list = self.batch_manager.random_list(
            self.b_num)
        save_image(xs, '{}/x_gt.png'.format(self.model_dir))
        save_image(ys, '{}/y_gt.png'.format(self.model_dir))

        with open('{}/gt.txt'.format(self.model_dir), 'w') as f:
            for sample in sample_list:
                f.write(sample + '\n')

        # call once
        summary_once = self.sess.run(self.summary_once)
        self.summary_writer.add_summary(summary_once, 0)
        self.summary_writer.flush()

        for step in trange(self.start_step, self.max_step):
            fetch_dict = {
                "optim": self.optim,
                "loss": self.loss,
            }

            if step % self.log_step == 0 or step == self.max_step - 1:
                fetch_dict.update({
                    "summary": self.summary_op,
                })

            if step % self.test_step == self.test_step - 1 or step == self.max_step - 1:
                l1, l2, iou, nb = 0, 0, 0, 0
                for x, y in self.batch_manager.test_batch():
                    if self.data_format == 'NCHW':
                        x = to_nchw_numpy(x)
                        y = to_nchw_numpy(y)
                    tl1, tl2, y_ = self.sess.run(
                        [self.tl1, self.tl2, self.yt_], {
                            self.xt: x,
                            self.yt: y
                        })
                    l1 += tl1
                    l2 += tl2
                    nb += 1

                    # iou
                    y_I = np.logical_and(y > 0, y_ > 0)
                    y_I_sum = np.sum(y_I, axis=(1, 2, 3))
                    y_U = np.logical_or(y > 0, y_ > 0)
                    y_U_sum = np.sum(y_U, axis=(1, 2, 3))
                    # print(y_I_sum, y_U_sum)
                    nonzero_id = np.where(y_U_sum != 0)[0]
                    if nonzero_id.shape[0] == 0:
                        acc = 1.0
                    else:
                        acc = np.average(y_I_sum[nonzero_id] /
                                         y_U_sum[nonzero_id])
                    iou += acc

                    if nb > 500:
                        break

                l1 /= float(nb)
                l2 /= float(nb)
                iou /= float(nb)

                summary_test = self.sess.run(
                    self.summary_test, {
                        self.test_acc_l1: l1,
                        self.test_acc_l2: l2,
                        self.test_acc_iou: iou
                    })
                self.summary_writer.add_summary(summary_test, step)
                self.summary_writer.flush()

            result = self.sess.run(fetch_dict)

            if step % self.log_step == 0 or step == self.max_step - 1:
                self.summary_writer.add_summary(result['summary'], step)
                self.summary_writer.flush()

                loss = result['loss']
                assert not np.isnan(loss), 'Model diverged with loss = NaN'

                print("\n[{}/{}] Loss: {:.6f}".format(step, self.max_step,
                                                      loss))

            if step % (self.log_step * 10) == 0 or step == self.max_step - 1:
                self.generate(x_list, self.model_dir, idx=step)

            if step % self.lr_update_step == self.lr_update_step - 1:
                self.sess.run(self.lr_update)

        # save last checkpoint..
        save_path = os.path.join(self.model_dir, 'model.ckpt')
        self.saver.save(self.sess, save_path, global_step=self.step)
        self.batch_manager.stop_thread()
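The trainer above resumes from self.start_step, so a restarted job's bar starts partway through. trange can also be told the true totals so the display reads, say, 400/1000 instead of 0/600; a sketch with illustrative step counts:

from tqdm import trange

start_step, max_step = 400, 1000  # pretend we resumed from a checkpoint
for step in trange(start_step, max_step, initial=start_step, total=max_step):
    pass  # the optimizer step and periodic summaries would go here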
Exemple #60
0
def main():
    parser = ArgumentParser()
    parser.add_argument('--train_corpus', type=str, default='./datasets/unlabel/device-service-train-merge.txt', required=False, help="sentence in each line.")
    parser.add_argument("--output_dir", type=str, default='./datasets/unlabel/device-service-rel', required=False)
    parser.add_argument("--bert_model", type=str, default='bert-base-uncased', required=False,
                        choices=["bert-base-uncased", "bert-large-uncased", "bert-base-cased",
                                 "bert-base-multilingual-uncased", "bert-base-chinese", "bert-base-multilingual-cased"])
    parser.add_argument("--do_lower_case", default=True)  
    parser.add_argument("--do_whole_word_mask", default=True, 
                        help="Whether to use whole word masking rather than per-WordPiece masking.")

    parser.add_argument("--num_workers", type=int, default=1,
                        help="The number of workers to use to write the files")
    parser.add_argument("--epochs_to_generate", type=int, default=1, 
                        help="Number of epochs of data to pregenerate")
    parser.add_argument("--max_seq_len", type=int, default=100) 
    parser.add_argument("--short_seq_prob", type=float, default=0.1,
                        help="Probability of making a short sentence as a training example")
    parser.add_argument("--masked_lm_prob", type=float, default=0.20,
                        help="Probability of masking each token for the LM task")
    parser.add_argument("--max_predictions_per_seq", type=int, default=25,
                        help="Maximum number of tokens to mask in each sequence")

    args = parser.parse_args()
 
    with open('./datasets/tag_vocab.txt', 'r', encoding='utf-8') as fp:
        tag_vocab = fp.read().splitlines()
        tag_vocab = [l.strip() for l in tag_vocab]
    args.tag_vocab = tag_vocab

    tokenizer = SubTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
    vocab_list = list(tokenizer.vocab.keys())
    nlp = spacy.load("en_core_web_sm")
    all_tags = set()
    all_rel = set()
    with DocumentDatabase() as docs:
        for file in glob.glob(args.train_corpus):
            with open(file, 'r', encoding='utf-8') as f:
                doc = []  # token
                tag = []  # pos tags
                head = []  # head index
                arc_label = []  # dependency relation
                domain_label = []  # domain label
                for line in tqdm(f, desc="Loading Dataset", unit=" lines"):
                    line = line.strip().lower() if args.do_lower_case else line.strip()
                    if len(line) == 1:
                        if len(doc): 
                            docs.add_document(doc, tag, head, arc_label, domain_label)
                        doc = []
                        tag = []
                        head = []
                        arc_label = []
                        domain_label = []
                    else:
                        domain, line = line.split('***')[:2]
                        nlp_doc = nlp(line)
                        tokens = [t.text for t in nlp_doc]
                        token_tags = [t.tag_ for t in nlp_doc]  
                        token_head = parse_tree(nlp_doc)  
                        token_dep_rel = [t.dep_ for t in nlp_doc]
                        all_rel.update(token_dep_rel)
                        tokens, token_tags, token_head_index, token_dep_rel, _ = tokenizer.subword_tokenize(tokens,
                                                                                                            token_tags,
                                                                                                            token_head,
                                                                                                            token_dep_rel)
                        all_tags.update(token_tags)
                        
                        assert len(tokens) == len(token_tags) == len(token_head_index) == len(token_dep_rel)
                        doc.append(tokens)
                        tag.append(token_tags)
                        head.append(token_head_index)
                        arc_label.append(token_dep_rel)
                        domain_label.append(domain)
                if doc:
                    docs.add_document(doc, tag, head, arc_label,
                                      domain_label)  # If the last doc didn't end on a newline, make sure it still gets added
        if len(docs) < 1:
            exit("ERROR: No document breaks were found in the input file!")
        if not os.path.exists(args.output_dir):
            os.mkdir(args.output_dir)

        if args.num_workers > 1:
            writer_workers = Pool(min(args.num_workers, args.epochs_to_generate))
            arguments = [(docs, vocab_list, args, idx) for idx in range(args.epochs_to_generate)]
            writer_workers.starmap(create_training_file, arguments)
        else:
            for epoch in trange(args.epochs_to_generate, desc="Epoch"):
                create_training_file(docs, vocab_list, args, epoch)
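A minimal sketch of the dispatch at the end of main(): fan out one process per epoch when --num_workers > 1, otherwise fall back to a plain trange loop. write_epoch is a hypothetical stand-in for create_training_file:

from multiprocessing import Pool
from tqdm import trange

def write_epoch(epoch):
    # hypothetical stand-in for create_training_file(docs, vocab_list, args, epoch)
    print("epoch_%d done" % epoch)

if __name__ == "__main__":
    num_workers, epochs_to_generate = 1, 3
    if num_workers > 1:
        with Pool(min(num_workers, epochs_to_generate)) as pool:
            pool.map(write_epoch, range(epochs_to_generate))
    else:
        for epoch in trange(epochs_to_generate, desc="Epoch"):
            write_epoch(epoch)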