def run(lam, gam, tau, ITERS, attack_years=None, savetail='', pmitail=''):
    """Train time-coupled embedding factors U, V over all time points.

    Loads a pre-trained static embedding as the initializer for every time
    slice, then performs ITERS sweeps over the time points, updating the U
    and V factors per mini-batch via util.update.  Each completed iteration
    is checkpointed with pickle, and existing checkpoints are reloaded so
    interrupted runs resume where they left off.

    Parameters:
        lam, gam, tau: regularization weights, passed through to util.update.
        ITERS: number of full training passes.
        attack_years: time points whose PMI file name carries the `pmitail`
            suffix (perturbed/alternate data); default: none.
        savetail: suffix appended to checkpoint file names.
        pmitail: suffix appended to PMI file names for attack_years.

    Relies on module-level globals: nw, T, savehead, util, print_params,
    np, sio, copy, pickle.
    """
    # Avoid the mutable-default-argument pitfall.
    attack_years = [] if attack_years is None else attack_years
    r = 50    # embedding rank
    b = nw    # batch size; b == nw means one full batch
    emph = 1  # weight emphasizing the nonzero PMI entries
    savefile = savehead + 'L' + str(lam) + 'T' + str(tau) + 'G' + str(gam) + 'A' + str(emph)
    if len(savetail) > 0:
        savetail = '_' + savetail
    if len(pmitail) > 0:
        pmitail = '_' + pmitail
    savefile = savefile + savetail
    # sio.loadmat appends the '.mat' extension automatically when missing.
    embfilename = 'data/emb_static%s' % savetail
    print(savefile, embfilename)
    try:
        e = sio.loadmat(embfilename)['emb']
    except IOError:
        print('file not available yet', embfilename)
        return

    print('starting training with following parameters')
    print_params(r, lam, tau, gam, emph, ITERS)
    print('there are a total of {} words, and {} time points'.format(nw, T))
    print('X*X*X*X*X*X*X*X*X')

    print('initializing')
    # One independent copy of the static embedding per time point.
    Ulist = [copy.deepcopy(e) for x in T]
    Vlist = [copy.deepcopy(e) for x in T]
    del e  # free the template before training

    print('getting batch indices')
    if b < nw:
        b_ind = util.getbatches(nw, b)
    else:
        b_ind = [list(range(nw))]

    import time
    start_time = time.time()
    # sequential updates
    for iteration in range(ITERS):
        print_params(r, lam, tau, gam, emph, ITERS)
        # Resume support: skip iterations that already have a checkpoint.
        try:
            Ulist = pickle.load(open("%sngU_iter%d.p" % (savefile, iteration), "rb"))
            Vlist = pickle.load(open("%sngV_iter%d.p" % (savefile, iteration), "rb"))
            print('iteration %d loaded succesfully' % iteration)
            continue
        except IOError:
            pass

        # Visit time points in order on the first pass, shuffled afterwards.
        # (The original computed a permutation of T but never used it; we
        # shuffle the *indices* so neighbor lookups t-1/t+1 stay consistent.)
        if iteration == 0:
            time_order = range(len(T))
        else:
            time_order = np.random.permutation(len(T))

        for t in time_order:  # select a time
            print('iteration %d, time %d' % (iteration, t))
            if T[t] in attack_years:
                f = 'data/wordPMI_%d%s.mat' % (T[t], pmitail)
            else:
                f = 'data/wordPMI_' + str(T[t]) + '.mat'
            pmi = sio.loadmat(f)['pmi']

            for j in range(len(b_ind)):  # select a mini batch
                print('%d out of %d' % (j, len(b_ind)))
                ind = b_ind[j]

                # UPDATE V: get the PMI columns for this batch.
                pmi_seg = pmi[:, ind].todense()

                # Temporal neighbors; zeros at the boundaries (iflag marks them).
                if t == 0:
                    vp = np.zeros((len(ind), r))
                    up = np.zeros((len(ind), r))
                    iflag = True
                else:
                    vp = Vlist[t - 1][ind, :]
                    up = Ulist[t - 1][ind, :]
                    iflag = False

                if t == len(T) - 1:
                    vn = np.zeros((len(ind), r))
                    un = np.zeros((len(ind), r))
                    iflag = True
                else:
                    vn = Vlist[t + 1][ind, :]
                    un = Ulist[t + 1][ind, :]
                    iflag = False

                Vlist[t][ind, :] = util.update(Ulist[t], emph * pmi_seg, vp, vn,
                                               lam, tau, gam, ind, iflag)
                Ulist[t][ind, :] = util.update(Vlist[t], emph * pmi_seg, up, un,
                                               lam, tau, gam, ind, iflag)
            #### INNER BATCH LOOP END

        # Checkpoint this iteration.
        print('time elapsed = ', time.time() - start_time)
        pickle.dump(Ulist, open("%sngU_iter%d.p" % (savefile, iteration), "wb"),
                    pickle.HIGHEST_PROTOCOL)
        pickle.dump(Vlist, open("%sngV_iter%d.p" % (savefile, iteration), "wb"),
                    pickle.HIGHEST_PROTOCOL)
def run(lam, gam, tau, ITERS, attack_years=None, savetail="", pmitail=""):
    """Train time-coupled embedding factors U, V over all time points.

    Loads the static embedding `data/emb_static<savetail>.mat` to initialize
    every time slice, then performs ITERS sweeps over the time points,
    updating U and V per mini-batch via util.update.  Each completed
    iteration is pickled, and existing checkpoints are reloaded so an
    interrupted run resumes where it left off.

    Parameters:
        lam, gam, tau: regularization weights, passed through to util.update.
        ITERS: number of full training passes.
        attack_years: time points whose PMI file name carries the `pmitail`
            suffix (perturbed/alternate data); default: none.
        savetail: suffix appended to checkpoint and embedding file names.
        pmitail: suffix appended to PMI file names for attack_years.

    Relies on module-level globals: nw, T, savehead, util, print_params,
    np, sio, copy, pickle.
    """
    # Avoid the mutable-default-argument pitfall.
    attack_years = [] if attack_years is None else attack_years
    r = 50    # embedding rank
    b = nw    # batch size; b == nw means one full batch
    emph = 1  # weight emphasizing the nonzero PMI entries
    savefile = (savehead + "L" + str(lam) + "T" + str(tau) + "G" + str(gam) +
                "A" + str(emph))
    if len(savetail) > 0:
        savetail = "_" + savetail
    if len(pmitail) > 0:
        pmitail = "_" + pmitail
    savefile = savefile + savetail
    embfilename = "data/emb_static%s.mat" % savetail
    print(savefile, embfilename)
    try:
        e = sio.loadmat(embfilename)["emb"]
    except IOError:
        print("file not available yet", embfilename)
        return

    print("starting training with following parameters")
    print_params(r, lam, tau, gam, emph, ITERS)
    print("there are a total of {} words, and {} time points".format(nw, T))
    print("X*X*X*X*X*X*X*X*X")

    print("initializing")
    # One independent copy of the static embedding per time point.
    Ulist = [copy.deepcopy(e) for x in T]
    Vlist = [copy.deepcopy(e) for x in T]
    del e  # free the template before training

    print("getting batch indices")
    if b < nw:
        b_ind = util.getbatches(nw, b)
    else:
        b_ind = [list(range(nw))]

    import time
    start_time = time.time()
    # sequential updates
    for iteration in range(ITERS):
        print_params(r, lam, tau, gam, emph, ITERS)
        # Resume support: skip iterations that already have a checkpoint.
        try:
            Ulist = pickle.load(
                open("%sngU_iter%d.p" % (savefile, iteration), "rb"))
            Vlist = pickle.load(
                open("%sngV_iter%d.p" % (savefile, iteration), "rb"))
            print("iteration %d loaded succesfully" % iteration)
            continue
        except IOError:
            pass

        # Visit time points in order on the first pass, shuffled afterwards.
        # (The original computed a permutation of T but never used it; we
        # shuffle the *indices* so neighbor lookups t-1/t+1 stay consistent.)
        if iteration == 0:
            time_order = range(len(T))
        else:
            time_order = np.random.permutation(len(T))

        for t in time_order:  # select a time
            print("iteration %d, time %d" % (iteration, t))
            # NOTE: this variant reads PMI from the emb_* files
            # (previously data/wordPMI_*.mat).
            if T[t] in attack_years:
                fname = "data/emb_{}{}.mat".format(T[t], pmitail)
            else:
                fname = "data/emb_{}.mat".format(T[t])
            pmi = sio.loadmat(fname)["pmi"]

            for j in range(len(b_ind)):  # select a mini batch
                print("%d out of %d" % (j, len(b_ind)))
                ind = b_ind[j]

                ## UPDATE V: get the PMI columns for this batch.
                pmi_seg = pmi[:, ind].todense()

                # Temporal neighbors; zeros at the boundaries (iflag marks them).
                if t == 0:
                    vp = np.zeros((len(ind), r))
                    up = np.zeros((len(ind), r))
                    iflag = True
                else:
                    vp = Vlist[t - 1][ind, :]
                    up = Ulist[t - 1][ind, :]
                    iflag = False

                if t == len(T) - 1:
                    vn = np.zeros((len(ind), r))
                    un = np.zeros((len(ind), r))
                    iflag = True
                else:
                    vn = Vlist[t + 1][ind, :]
                    un = Ulist[t + 1][ind, :]
                    iflag = False

                Vlist[t][ind, :] = util.update(Ulist[t], emph * pmi_seg, vp, vn,
                                               lam, tau, gam, ind, iflag)
                Ulist[t][ind, :] = util.update(Vlist[t], emph * pmi_seg, up, un,
                                               lam, tau, gam, ind, iflag)
            #### INNER BATCH LOOP END

        # Checkpoint this iteration.
        print("time elapsed = ", time.time() - start_time)
        pickle.dump(
            Ulist,
            open("%sngU_iter%d.p" % (savefile, iteration), "wb"),
            pickle.HIGHEST_PROTOCOL,
        )
        pickle.dump(
            Vlist,
            open("%sngV_iter%d.p" % (savefile, iteration), "wb"),
            pickle.HIGHEST_PROTOCOL,
        )
print('starting training with following parameters') print_params(r, lam, tau, gam, emph, ITERS) print('there are a total of {} words, and {} time points'.format(nw, T)) print('X*X*X*X*X*X*X*X*X') print('initializing') #Ulist,Vlist = util.initvars(nw,T,r, trainhead) Ulist, Vlist = util.import_static_init(T) print(Ulist) print(Vlist) #asdf print('getting batch indices') if b < nw: b_ind = util.getbatches(nw, b) else: b_ind = [list(range(nw))] import time start_time = time.time() # sequential updates for iteration in range(ITERS): print_params(r, lam, tau, gam, emph, ITERS) try: Ulist = pickle.load( open("%sngU_iter%d.p" % (savefile, iteration), "rb")) Vlist = pickle.load( open("%sngV_iter%d.p" % (savefile, iteration), "rb")) print('iteration %d loaded succesfully' % iteration) continue
def run_dw2v(exper_dir, iters, lam, tau, gam, emph, r):
    """Train dynamic word2vec factors from an experiment directory layout.

    Reads vocabulary size, time points and the static-embedding initializer
    from `<exper_dir>/embs`, then runs `iters` sweeps over the time points,
    updating the U and V factors per mini-batch with util.update.  Each
    completed iteration is pickled under `<exper_dir>/results`, and existing
    checkpoints are reloaded so interrupted runs resume.

    Parameters:
        exper_dir: experiment root; expects embs/wordPairPMI_<t>.csv,
            embs/wordIDHash.csv and embs/emb_static.mat under it.
        iters: number of full training passes.
        lam, tau, gam: regularization weights passed through to util.update.
        emph: weight emphasizing the nonzero PMI entries.
        r: embedding rank.

    Relies on module-level helpers: os, np, pickle, get_vocabulary_size,
    get_points, print_params.
    """
    import DynamicWord2Vec.train_model.util_timeCD as util

    data_dir = os.path.join(exper_dir, 'embs')
    trainhead = os.path.join(data_dir, 'wordPairPMI_')
    voc_path = os.path.join(data_dir, 'wordIDHash.csv')
    embmat = os.path.join(data_dir, 'emb_static.mat')
    # b == nw: one full batch over the whole vocabulary.
    b = nw = get_vocabulary_size(voc_path)
    ITERS = iters
    T = get_points(data_dir)
    savehead = os.path.join(exper_dir, 'results')
    savefile = 'L{}T{}G{}A{}'.format(lam, tau, gam, emph)
    savefile = os.path.join(savehead, savefile)
    print(f"b = {b}, nw = {nw}, T = {T}")
    print(iters, lam, tau, gam, emph, r)

    print('starting training with following parameters')
    print_params(r, lam, tau, gam, emph, ITERS)
    print('there are a total of {} words, and {} time points'.format(nw, T))
    print('X*X*X*X*X*X*X*X*X')

    print('initializing')
    # Initialize every time slice from the static embedding.
    #Ulist,Vlist = util.initvars(nw,T,r, trainhead)
    Ulist, Vlist = util.import_static_init(T, embmat)
    print(Ulist)
    print(Vlist)

    print('getting batch indices')
    if b < nw:
        b_ind = util.getbatches(nw, b)
    else:
        b_ind = [list(range(nw))]

    import time
    start_time = time.time()
    # sequential updates
    for iteration in range(ITERS):
        print_params(r, lam, tau, gam, emph, ITERS)
        # Resume support: skip iterations that already have a checkpoint.
        try:
            Ulist = pickle.load(
                open("%sngU_iter%d.p" % (savefile, iteration), "rb"))
            Vlist = pickle.load(
                open("%sngV_iter%d.p" % (savefile, iteration), "rb"))
            print('iteration %d loaded succesfully' % iteration)
            continue
        except IOError:
            pass

        # Visit time points in order on the first pass, shuffled afterwards.
        # (The original computed a permutation of T but never used it; we
        # shuffle the *indices* so neighbor lookups t-1/t+1 stay consistent.)
        if iteration == 0:
            time_order = range(len(T))
        else:
            time_order = np.random.permutation(len(T))

        for t in time_order:  # select a time
            print('iteration %d, time %d' % (iteration, t))
            f = trainhead + str(T[t]) + '.csv'
            print(f)
            pmi = util.getmat(f, nw, False)

            for j in range(len(b_ind)):  # select a mini batch
                print('%d out of %d' % (j, len(b_ind)))
                ind = b_ind[j]

                ## UPDATE V: get the PMI columns for this batch.
                pmi_seg = pmi[:, ind].todense()

                # Temporal neighbors; zeros at the boundaries (iflag marks them).
                if t == 0:
                    vp = np.zeros((len(ind), r))
                    up = np.zeros((len(ind), r))
                    iflag = True
                else:
                    vp = Vlist[t - 1][ind, :]
                    up = Ulist[t - 1][ind, :]
                    iflag = False

                if t == len(T) - 1:
                    vn = np.zeros((len(ind), r))
                    un = np.zeros((len(ind), r))
                    iflag = True
                else:
                    vn = Vlist[t + 1][ind, :]
                    un = Ulist[t + 1][ind, :]
                    iflag = False

                Vlist[t][ind, :] = util.update(Ulist[t], emph * pmi_seg, vp, vn,
                                               lam, tau, gam, ind, iflag)
                Ulist[t][ind, :] = util.update(Vlist[t], emph * pmi_seg, up, un,
                                               lam, tau, gam, ind, iflag)
            #### INNER BATCH LOOP END

        # Checkpoint this iteration.
        print('time elapsed = ', time.time() - start_time)
        pickle.dump(Ulist, open("%sngU_iter%d.p" % (savefile, iteration), "wb"),
                    pickle.HIGHEST_PROTOCOL)
        pickle.dump(Vlist, open("%sngV_iter%d.p" % (savefile, iteration), "wb"),
                    pickle.HIGHEST_PROTOCOL)