def normalize_emb(emb, method):
    """Normalize an embedding matrix with the scheme named by *method*.

    Supported methods: 'unit', 'center', 'unitdim', 'centeremb'.
    Any other value leaves the matrix unchanged.
    """
    print(f"Normalizing using {method}")
    # Table-driven dispatch instead of an if/elif ladder.
    normalizers = {
        'unit': embeddings.length_normalize,
        'center': embeddings.mean_center,
        'unitdim': embeddings.length_normalize_dimensionwise,
        'centeremb': embeddings.mean_center_embeddingwise,
    }
    normalize = normalizers.get(method)
    if normalize is not None:
        emb = normalize(emb)
    return emb
def main():
    """Command-line entry point: read an embedding file, apply the requested
    normalization actions in order, and write the result.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Normalize word embeddings')
    parser.add_argument(
        'actions',
        choices=['none', 'unit', 'center', 'unitdim', 'centeremb'],
        nargs='+',
        help='the actions to perform in order')
    parser.add_argument(
        '-i', '--input', default=sys.stdin.fileno(),
        help='the input word embedding file (defaults to stdin)')
    parser.add_argument(
        '-o', '--output', default=sys.stdout.fileno(),
        help='the output word embedding file (defaults to stdout)')
    # BUG FIX: the original declared this option with action='store_true',
    # which turns --encoding into a boolean flag: "--encoding latin-1" was
    # rejected by argparse, and passing bare "--encoding" set args.encoding
    # to True, breaking every open(..., encoding=args.encoding) call below.
    # It must be a plain string-valued option.
    parser.add_argument(
        '--encoding', default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    args = parser.parse_args()

    # Read input embeddings
    f = open(args.input, encoding=args.encoding, errors='surrogateescape')
    words, matrix = embeddings.read(f)

    # Perform normalization actions in the order given on the command line
    # ('none' intentionally matches no branch and is a no-op).
    for action in args.actions:
        if action == 'unit':
            matrix = embeddings.length_normalize(matrix)
        elif action == 'center':
            matrix = embeddings.mean_center(matrix)
        elif action == 'unitdim':
            matrix = embeddings.length_normalize_dimensionwise(matrix)
        elif action == 'centeremb':
            matrix = embeddings.mean_center_embeddingwise(matrix)

    # Write normalized embeddings
    f = open(args.output, mode='w', encoding=args.encoding,
             errors='surrogateescape')
    embeddings.write(words, matrix, f)
def main():
    """GeoMM: learn a mapping of two embedding spaces into a shared latent
    space via Riemannian optimization, then evaluate bilingual lexicon
    induction with CSLS retrieval.

    Relies on module-level imports visible elsewhere in this file:
    `embeddings`, `np`, `scipy`, `TT`/`shared` (theano), pymanopt
    (`Problem`, `ConjugateGradient`, `Stiefel`, `PositiveDefinite`,
    `Product`) and `cp` (cupy, GPU required for the CSLS step).
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Map the source embeddings into the target embedding space')
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('--model_path', default=None, type=str, help='directory to save the model')
    parser.add_argument('--geomm_embeddings_path', default=None, type=str,
                        help='directory to save the output GeoMM latent space embeddings. The output embeddings are normalized.')
    parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--max_vocab', default=0, type=int, help='Maximum vocabulary to be loaded, 0 allows complete vocabulary')
    parser.add_argument('--verbose', default=0, type=int, help='Verbose')
    mapping_group = parser.add_argument_group('mapping arguments', 'Basic embedding mapping arguments')
    mapping_group.add_argument('-dtrain', '--dictionary_train', default=sys.stdin.fileno(),
                               help='the training dictionary file (defaults to stdin)')
    mapping_group.add_argument('-dtest', '--dictionary_test', default=sys.stdin.fileno(),
                               help='the test dictionary file (defaults to stdin)')
    mapping_group.add_argument('--normalize', choices=['unit', 'center', 'unitdim', 'centeremb'], nargs='*', default=[],
                               help='the normalization actions to perform in order')
    geomm_group = parser.add_argument_group('GeoMM arguments', 'Arguments for GeoMM method')
    geomm_group.add_argument('--l2_reg', type=float, default=1e2, help='Lambda for L2 Regularization')
    geomm_group.add_argument('--max_opt_time', type=int, default=5000, help='Maximum time limit for optimization in seconds')
    geomm_group.add_argument('--max_opt_iter', type=int, default=150, help='Maximum number of iterations for optimization')
    eval_group = parser.add_argument_group('evaluation arguments', 'Arguments for evaluation')
    eval_group.add_argument('--normalize_eval', action='store_true', help='Normalize the embeddings at test time')
    eval_group.add_argument('--eval_batch_size', type=int, default=1000, help='Batch size for evaluation')
    eval_group.add_argument('--csls_neighbourhood', type=int, default=10, help='Neighbourhood size for CSLS')
    args = parser.parse_args()
    BATCH_SIZE = args.eval_batch_size

    if args.verbose:
        print('Current arguments: {0}'.format(args))

    dtype = 'float32'
    if args.verbose:
        print('Loading train data...')

    # Read input embeddings
    srcfile = open(args.src_input, encoding=args.encoding, errors='surrogateescape')
    trgfile = open(args.trg_input, encoding=args.encoding, errors='surrogateescape')
    src_words, x = embeddings.read(srcfile, max_voc=args.max_vocab, dtype=dtype)
    trg_words, z = embeddings.read(trgfile, max_voc=args.max_vocab, dtype=dtype)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Build training dictionary (pairs with an OOV word on either side are
    # skipped; both lookups happen before either append so the two index
    # lists always stay aligned)
    noov = 0
    src_indices = []
    trg_indices = []
    f = open(args.dictionary_train, encoding=args.encoding, errors='surrogateescape')
    for line in f:
        src, trg = line.split()
        if args.max_vocab:
            src = src.lower()
            trg = trg.lower()
        try:
            src_ind = src_word2ind[src]
            trg_ind = trg_word2ind[trg]
            src_indices.append(src_ind)
            trg_indices.append(trg_ind)
        except KeyError:
            noov += 1
            if args.verbose:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg))
    f.close()
    if args.verbose:
        print('Number of training pairs having at least one OOV: {}'.format(noov))

    if args.verbose:
        print('Normalizing embeddings...')
    # STEP 0: Normalization
    for action in args.normalize:
        if action == 'unit':
            x = embeddings.length_normalize(x)
            z = embeddings.length_normalize(z)
        elif action == 'center':
            x = embeddings.mean_center(x)
            z = embeddings.mean_center(z)
        elif action == 'unitdim':
            x = embeddings.length_normalize_dimensionwise(x)
            z = embeddings.length_normalize_dimensionwise(z)
        elif action == 'centeremb':
            x = embeddings.mean_center_embeddingwise(x)
            z = embeddings.mean_center_embeddingwise(z)

    # Step 1: Optimization
    if args.verbose:
        print('Beginning Optimization')
    start_time = time.time()
    x_count = len(set(src_indices))
    z_count = len(set(trg_indices))

    # Creating dictionary matrix from training set:
    # A[i, j] = 1 iff the i-th unique source word and the j-th unique target
    # word form a training pair.
    A = np.zeros((x_count, z_count))
    map_dict_src = {}
    map_dict_trg = {}
    I = 0
    uniq_src = []
    uniq_trg = []
    for i in range(len(src_indices)):
        if src_indices[i] not in map_dict_src.keys():
            map_dict_src[src_indices[i]] = I
            I += 1
            uniq_src.append(src_indices[i])
    J = 0
    for j in range(len(trg_indices)):
        if trg_indices[j] not in map_dict_trg.keys():
            map_dict_trg[trg_indices[j]] = J
            J += 1
            uniq_trg.append(trg_indices[j])
    for i in range(len(src_indices)):
        A[map_dict_src[src_indices[i]], map_dict_trg[trg_indices[i]]] = 1

    np.random.seed(0)
    Lambda = args.l2_reg

    # Symbolic cost: ||Kx W Kz^T - A||_F^2 expanded via traces, plus an L2
    # penalty on B, where W = U1 B U2^T with U1, U2 orthonormal (Stiefel)
    # and B positive definite.
    U1 = TT.matrix()
    U2 = TT.matrix()
    B = TT.matrix()
    Kx, Kz = x[uniq_src], z[uniq_trg]
    XtAZ = Kx.T.dot(A.dot(Kz))
    XtX = Kx.T.dot(Kx)
    ZtZ = Kz.T.dot(Kz)
    W = (U1.dot(B)).dot(U2.T)
    regularizer = 0.5 * Lambda * (TT.sum(B ** 2))
    sXtX = shared(XtX)
    sZtZ = shared(ZtZ)
    sXtAZ = shared(XtAZ)
    cost = regularizer
    wtxtxw = W.T.dot(sXtX.dot(W))
    wtxtxwztz = wtxtxw.dot(sZtZ)
    cost += TT.nlinalg.trace(wtxtxwztz)
    cost += -2 * TT.sum(W * sXtAZ)

    solver = ConjugateGradient(maxtime=args.max_opt_time, maxiter=args.max_opt_iter)
    manifold = Product([Stiefel(x.shape[1], x.shape[1]),
                        Stiefel(z.shape[1], x.shape[1]),
                        PositiveDefinite(x.shape[1])])
    problem = Problem(manifold=manifold, cost=cost, arg=[U1, U2, B], verbosity=3)
    wopt = solver.solve(problem)
    U1 = wopt[0]
    U2 = wopt[1]
    B = wopt[2]

    # Save the models if requested
    if args.model_path is not None:
        os.makedirs(args.model_path, exist_ok=True)
        np.savetxt('{}/U_src.csv'.format(args.model_path), U1)
        np.savetxt('{}/U_tgt.csv'.format(args.model_path), U2)
        np.savetxt('{}/B.csv'.format(args.model_path), B)

    # Step 2: Transformation into the shared latent space (B^{1/2} splits the
    # learned metric symmetrically between the two sides)
    xw = x.dot(U1).dot(scipy.linalg.sqrtm(B))
    zw = z.dot(U2).dot(scipy.linalg.sqrtm(B))
    end_time = time.time()
    if args.verbose:
        print('Completed training in {0:.2f} seconds'.format(end_time - start_time))
    gc.collect()

    # Save the GeoMM embeddings if requested (always length-normalized)
    xw_n = embeddings.length_normalize(xw)
    zw_n = embeddings.length_normalize(zw)
    if args.geomm_embeddings_path is not None:
        os.makedirs(args.geomm_embeddings_path, exist_ok=True)
        out_emb_fname = os.path.join(args.geomm_embeddings_path, 'src.vec')
        with open(out_emb_fname, 'w', encoding=args.encoding) as outfile:
            embeddings.write(src_words, xw_n, outfile)
        out_emb_fname = os.path.join(args.geomm_embeddings_path, 'trg.vec')
        with open(out_emb_fname, 'w', encoding=args.encoding) as outfile:
            embeddings.write(trg_words, zw_n, outfile)

    # Step 3: Evaluation
    if args.normalize_eval:
        xw = xw_n
        zw = zw_n

    # Loading test dictionary
    f = open(args.dictionary_test, encoding=args.encoding, errors='surrogateescape')
    src2trg = collections.defaultdict(set)
    trg2src = collections.defaultdict(set)
    oov = set()
    vocab = set()
    for line in f:
        src, trg = line.split()
        if args.max_vocab:
            src = src.lower()
            trg = trg.lower()
        try:
            src_ind = src_word2ind[src]
            trg_ind = trg_word2ind[trg]
            src2trg[src_ind].add(trg_ind)
            trg2src[trg_ind].add(src_ind)
            vocab.add(src)
        except KeyError:
            oov.add(src)
    src = list(src2trg.keys())
    # If one of the translation options is in the vocabulary, then the entry
    # is not an oov
    oov -= vocab
    coverage = len(src2trg) / (len(src2trg) + len(oov))
    f.close()

    translation = collections.defaultdict(int)
    translation5 = collections.defaultdict(list)
    translation10 = collections.defaultdict(list)

    # CSLS: mean similarity of every source word to its k nearest targets...
    nbrhood_x = np.zeros(xw.shape[0])
    for i in range(0, len(src), BATCH_SIZE):
        j = min(i + BATCH_SIZE, len(src))
        similarities = xw[src[i:j]].dot(zw.T)
        similarities_x = -1 * np.partition(-1 * similarities, args.csls_neighbourhood - 1, axis=1)
        nbrhood_x[src[i:j]] = np.mean(similarities_x[:, :args.csls_neighbourhood], axis=1)

    # ...and of every target word to its k nearest sources (GPU version)
    with cp.cuda.Device(0):
        nbrhood_z2 = cp.zeros(zw.shape[0])
        for i in range(0, zw.shape[0], BATCH_SIZE):
            j = min(i + BATCH_SIZE, zw.shape[0])
            similarities = -1 * cp.partition(
                -1 * cp.dot(cp.asarray(zw[i:j]), cp.transpose(cp.asarray(xw))),
                args.csls_neighbourhood - 1, axis=1)[:, :args.csls_neighbourhood]
            nbrhood_z2[i:j] = (cp.mean(similarities[:, :args.csls_neighbourhood], axis=1))
        nbrhood_z = cp.asnumpy(nbrhood_z2)

    # Find translations with the CSLS criterion (and write them to a CSV
    # file if an output directory was requested)
    delim = ','
    translations_file = None
    if args.geomm_embeddings_path is not None:
        os.makedirs(args.geomm_embeddings_path, exist_ok=True)
        translations_fname = os.path.join(args.geomm_embeddings_path, 'translations.csv')
        translations_file = open(translations_fname, 'w', encoding=args.encoding)
    for i in range(0, len(src), BATCH_SIZE):
        j = min(i + BATCH_SIZE, len(src))
        similarities = xw[src[i:j]].dot(zw.T)
        similarities = np.transpose(np.transpose(2 * similarities) - nbrhood_x[src[i:j]]) - nbrhood_z
        nn = similarities.argmax(axis=1).tolist()
        similarities = np.argsort((similarities), axis=1)
        nn5 = (similarities[:, -5:])
        nn10 = (similarities[:, -10:])
        for k in range(j - i):
            translation[src[i + k]] = nn[k]
            translation5[src[i + k]] = nn5[k]
            translation10[src[i + k]] = nn10[k]
            if translations_file is not None:
                src_id = src[i + k]
                src_word = src_words[src_id]
                all_trg_words = [trg_words[trg_id] for trg_id in src2trg[src_id]]
                trgout_words = [trg_words[t] for t in translation10[src_id]]
                p1 = ':'.join(all_trg_words)
                p2 = ':'.join(trgout_words)
                translations_file.write('{s}{delim}{p1}{delim}{p2}\n'.format(s=src_word, p1=p1, p2=p2, delim=delim))
    if translations_file is not None:
        translations_file.close()

    # Precision@1 / @5 / @10
    accuracy = np.mean([1 if translation[i] in src2trg[i] else 0 for i in src])
    mean = 0
    for i in src:
        for k in translation5[i]:
            if k in src2trg[i]:
                mean += 1
                break
    mean /= len(src)
    accuracy5 = mean
    mean = 0
    for i in src:
        for k in translation10[i]:
            if k in src2trg[i]:
                mean += 1
                break
    mean /= len(src)
    accuracy10 = mean

    # BUG FIX: the original referenced the undefined names src_input and
    # trg_input (a NameError at runtime — the parsed values live on args),
    # never printed the assembled message, and discarded the computed top-5
    # and top-10 accuracies. Report all of them, matching the sibling
    # geomm_multi script's output format.
    message = (args.src_input.split(".")[-2] + "-->" + args.trg_input.split(".")[-2] + ":"
               + 'Coverage:{0:7.2%} Accuracy:{1:7.2%} Accuracy(Top 5):{2:7.2%} Accuracy(Top 10):{3:7.2%}'
               .format(coverage, accuracy, accuracy5, accuracy10))
    print(message)
def main() -> None:
    """Map source embeddings into the target space with a linear transform
    learned from a seed dictionary (orthogonal Procrustes by default, or an
    unconstrained least-squares fit), optionally refining the dictionary by
    self-learning, then write the mapped embeddings.

    Depends on module-level names defined elsewhere in this file:
    `embeddings`, `np`, and the batching constants MAX_DIM_X / MAX_DIM_Z
    (presumably memory-bound tile sizes for the similarity matrix — confirm
    at their definition site).
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Map the source embeddings into the target embedding space'
    )
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('src_output', help='the output source embeddings')
    parser.add_argument('trg_output', help='the output target embeddings')
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    mapping_group = parser.add_argument_group(
        'mapping arguments', 'Basic embedding mapping arguments (EMNLP 2016)')
    mapping_group.add_argument(
        '-d',
        '--dictionary',
        default=sys.stdin.fileno(),
        help='the training dictionary file (defaults to stdin)')
    mapping_group.add_argument(
        '--normalize',
        choices=['unit', 'center', 'unitdim', 'centeremb'],
        nargs='*',
        default=[],
        help='the normalization actions to perform in order')
    # -c and -u share dest='orthogonal' so they act as an on/off pair.
    mapping_group.add_argument(
        '-c',
        '--orthogonal',
        dest='orthogonal',
        action='store_true',
        help='use orthogonal constrained mapping (default)')
    mapping_group.add_argument('-u',
                               '--unconstrained',
                               dest='orthogonal',
                               action='store_false',
                               help='use unconstrained mapping')
    parser.set_defaults(orthogonal=True)
    self_learning_group = parser.add_argument_group(
        'self-learning arguments',
        'Optional arguments for self-learning (ACL 2017)')
    self_learning_group.add_argument('--self_learning',
                                     action='store_true',
                                     help='enable self-learning')
    self_learning_group.add_argument(
        '--direction',
        choices=['forward', 'backward', 'union'],
        default='forward',
        help='the direction for dictionary induction (defaults to forward)')
    self_learning_group.add_argument(
        '--numerals',
        action='store_true',
        help='use latin numerals (i.e. words matching [0-9]+) as the seed dictionary'
    )
    self_learning_group.add_argument(
        '--threshold',
        default=0.000001,
        type=float,
        help='the convergence threshold (defaults to 0.000001)')
    self_learning_group.add_argument(
        '--validation',
        default=None,
        help='a dictionary file for validation at each iteration')
    self_learning_group.add_argument(
        '--log', help='write to a log file in tsv format at each iteration')
    self_learning_group.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        help='write log information to stderr at each iteration')
    args = parser.parse_args()

    # Read input embeddings
    srcfile = open(args.src_input,
                   encoding=args.encoding,
                   errors='surrogateescape')
    trgfile = open(args.trg_input,
                   encoding=args.encoding,
                   errors='surrogateescape')
    src_words, x = embeddings.read(srcfile)
    trg_words, z = embeddings.read(trgfile)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Build training dictionary: either numerals shared by both vocabularies
    # (unsupervised seeding) or an explicit dictionary file.
    src_indices = []
    trg_indices = []
    if args.numerals:
        if args.dictionary != sys.stdin.fileno():
            print('WARNING: Using numerals instead of the training dictionary',
                  file=sys.stderr)
        numeral_regex = re.compile('^[0-9]+$')
        src_numerals = {
            word
            for word in src_words if numeral_regex.match(word) is not None
        }
        trg_numerals = {
            word
            for word in trg_words if numeral_regex.match(word) is not None
        }
        numerals = src_numerals.intersection(trg_numerals)
        for word in numerals:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
    else:
        f = open(args.dictionary,
                 encoding=args.encoding,
                 errors='surrogateescape')
        for line in f:
            src, trg = line.split()
            try:
                # Both lookups happen before either append, keeping the two
                # index lists aligned when one side is OOV.
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                src_indices.append(src_ind)
                trg_indices.append(trg_ind)
            except KeyError:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(
                    src, trg),
                      file=sys.stderr)

    # Read validation dictionary
    if args.validation is not None:
        f = open(args.validation,
                 encoding=args.encoding,
                 errors='surrogateescape')
        validation = collections.defaultdict(set)
        oov = set()
        vocab = set()
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                validation[src_ind].add(trg_ind)
                vocab.add(src)
            except KeyError:
                pass
            # Every source word is provisionally marked OOV; the in-vocabulary
            # ones are removed below.
            oov.add(src)
        oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
        validation_coverage = len(validation) / (len(validation) + len(oov))

    # Create log file
    if args.log:
        log = open(args.log,
                   mode='w',
                   encoding=args.encoding,
                   errors='surrogateescape')

    # Normalize embeddings
    for action in args.normalize:
        if action == 'unit':
            x = embeddings.length_normalize(x)
            z = embeddings.length_normalize(z)
        elif action == 'center':
            x = embeddings.mean_center(x)
            z = embeddings.mean_center(z)
        elif action == 'unitdim':
            x = embeddings.length_normalize_dimensionwise(x)
            z = embeddings.length_normalize_dimensionwise(z)
        elif action == 'centeremb':
            x = embeddings.mean_center_embeddingwise(x)
            z = embeddings.mean_center_embeddingwise(z)

    # Training loop: without self-learning the objective never changes from
    # its sentinel (-100), so the loop runs exactly once.
    prev_objective = objective = -100.
    it = 1
    t = time.time()
    while it == 1 or objective - prev_objective >= args.threshold:

        # Update the embedding mapping
        if args.orthogonal:  # orthogonal mapping (Procrustes via SVD)
            u, s, vt = np.linalg.svd(np.dot(z[trg_indices].T, x[src_indices]))
            w = np.dot(vt.T, u.T)
        else:  # unconstrained mapping (normal-equations least squares)
            x_pseudoinv = np.dot(
                np.linalg.inv(np.dot(x[src_indices].T, x[src_indices])),
                x[src_indices].T)
            w = np.dot(x_pseudoinv, z[trg_indices])
        xw = x.dot(w)

        # Self-learning
        if args.self_learning:

            # Update the training dictionary: nearest-neighbour induction,
            # tiled in MAX_DIM_X x MAX_DIM_Z blocks to bound the size of the
            # similarity matrix held in memory.
            best_sim_forward = np.full(x.shape[0], -100.)
            src_indices_forward = range(x.shape[0])
            trg_indices_forward = np.zeros(x.shape[0], dtype=int)
            best_sim_backward = np.full(z.shape[0], -100.)
            src_indices_backward = np.zeros(z.shape[0], dtype=int)
            trg_indices_backward = range(z.shape[0])
            for i in range(0, x.shape[0], MAX_DIM_X):
                for j in range(0, z.shape[0], MAX_DIM_Z):
                    sim = xw[i:i + MAX_DIM_X].dot(z[j:j + MAX_DIM_Z].T)
                    for k in range(sim.shape[0]):
                        l = sim[k].argmax()
                        if sim[k, l] > best_sim_forward[i + k]:
                            best_sim_forward[i + k] = sim[k, l]
                            trg_indices_forward[i + k] = j + l
                    if args.direction in (
                            'backward', 'union'):  # Slow, only do if necessary
                        for l in range(sim.shape[1]):
                            k = sim[:, l].argmax()
                            if sim[k, l] > best_sim_backward[j + l]:
                                best_sim_backward[j + l] = sim[k, l]
                                src_indices_backward[j + l] = i + k
                    # Drop the tile eagerly to keep peak memory down.
                    sim = None
            if args.direction == 'forward':
                src_indices = src_indices_forward
                trg_indices = trg_indices_forward
            elif args.direction == 'backward':
                src_indices = src_indices_backward
                trg_indices = trg_indices_backward
            elif args.direction == 'union':
                src_indices = np.concatenate(
                    (src_indices_forward, src_indices_backward))
                trg_indices = np.concatenate(
                    (trg_indices_forward, trg_indices_backward))

            # Objective function evaluation
            prev_objective = objective
            if args.direction == 'forward':
                objective = np.mean(best_sim_forward)
            elif args.direction == 'backward':
                objective = np.mean(best_sim_backward)
            elif args.direction == 'union':
                objective = (np.mean(best_sim_forward) +
                             np.mean(best_sim_backward)) / 2

            # Accuracy and similarity evaluation in validation
            if args.validation is not None:
                accuracy = np.mean([
                    1 if trg_indices_forward[src] in trg else 0
                    for src, trg in validation.items()
                ])
                similarity = np.mean([
                    np.max(z[list(trg)].dot(xw[src]))
                    for src, trg in validation.items()
                ])

            # Logging
            duration = time.time() - t
            if args.verbose:
                print(file=sys.stderr)
                print('ITERATION {0} ({1:.2f}s)'.format(it, duration),
                      file=sys.stderr)
                print('\t- Objective: {0:9.4f}%'.format(100 * objective),
                      file=sys.stderr)
                if args.validation is not None:
                    print('\t- Val. similarity: {0:9.4f}%'.format(
                        100 * similarity),
                          file=sys.stderr)
                    print('\t- Val. accuracy: {0:9.4f}%'.format(
                        100 * accuracy),
                          file=sys.stderr)
                    print('\t- Val. coverage: {0:9.4f}%'.format(
                        100 * validation_coverage),
                          file=sys.stderr)
                sys.stderr.flush()
            if args.log is not None:
                val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format(
                    100 * similarity, 100 * accuracy, 100 *
                    validation_coverage) if args.validation is not None else ''
                print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(
                    it, 100 * objective, val, duration),
                      file=log)
                log.flush()

        t = time.time()
        it += 1

    # Write mapped embeddings: the source side is mapped (xw), the target
    # side is written unchanged.
    srcfile = open(args.src_output,
                   mode='w',
                   encoding=args.encoding,
                   errors='surrogateescape')
    trgfile = open(args.trg_output,
                   mode='w',
                   encoding=args.encoding,
                   errors='surrogateescape')
    embeddings.write(src_words, xw, srcfile)
    embeddings.write(trg_words, z, trgfile)
    srcfile.close()
    trgfile.close()
def main() -> None:
    """GeoMM Multi: jointly learn GeoMM mappings for several languages into
    one shared latent space, then evaluate bilingual lexicon induction with
    CSLS for every test language pair.

    The training-dictionary file lists one language pair per line as
    "src_lang_id,trg_lang_id,dictionary_path"; the embedding file lists one
    embedding path per line. Depends on module-level names defined elsewhere
    in this file: `embeddings`, `np`, `scipy`, `cp` (cupy — GPU required),
    theano's `TT`/`shared`, and pymanopt's solver/manifold classes.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Map the source embeddings into the target embedding space')
    parser.add_argument('emb_file', help='the input target embeddings')
    parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--max_vocab', default=0,type=int, help='Maximum vocabulary to be loaded, 0 allows complete vocabulary')
    parser.add_argument('--verbose', default=0,type=int, help='Verbose')
    mapping_group = parser.add_argument_group('mapping arguments', 'Basic embedding mapping arguments')
    mapping_group.add_argument('-dtrain_file', '--dictionary_train_file', default=sys.stdin.fileno(), help='the training dictionary file (defaults to stdin)')
    mapping_group.add_argument('-dtest_file', '--dictionary_test_file', default=sys.stdin.fileno(), help='the test dictionary file (defaults to stdin)')
    mapping_group.add_argument('--normalize', choices=['unit', 'center', 'unitdim', 'centeremb'], nargs='*', default=[], help='the normalization actions to perform in order')
    geomm_group = parser.add_argument_group('GeoMM Multi arguments', 'Arguments for GeoMM Multi method')
    geomm_group.add_argument('--l2_reg', type=float,default=1e3, help='Lambda for L2 Regularization')
    geomm_group.add_argument('--max_opt_time', type=int,default=5000, help='Maximum time limit for optimization in seconds')
    geomm_group.add_argument('--max_opt_iter', type=int,default=150, help='Maximum number of iterations for optimization')
    eval_group = parser.add_argument_group('evaluation arguments', 'Arguments for evaluation')
    eval_group.add_argument('--normalize_eval', action='store_true', help='Normalize the embeddings at test time')
    eval_group.add_argument('--eval_batch_size', type=int,default=1000, help='Batch size for evaluation')
    eval_group.add_argument('--csls_neighbourhood', type=int,default=10, help='Neighbourhood size for CSLS')
    args = parser.parse_args()
    BATCH_SIZE = args.eval_batch_size

    # Logging: everything printed below is mirrored into a timestamped log
    # file by replacing sys.stdout with a tee-like Logger.
    method_name = os.path.join('logs','geomm_multi')
    directory = os.path.join(os.path.join(os.getcwd(),method_name), datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    if not os.path.exists(directory):
        os.makedirs(directory)
    log_file_name, file_extension = os.path.splitext(os.path.basename(args.dictionary_train_file))
    log_file_name = log_file_name + '.log'
    class Logger(object):
        # Tee: writes to the real stdout and appends to the log file.
        def __init__(self):
            self.terminal = sys.stdout
            self.log = open(os.path.join(directory,log_file_name), "a")
        def write(self, message):
            self.terminal.write(message)
            self.log.write(message)
        def flush(self):
            #this flush method is needed for python 3 compatibility.
            #this handles the flush command by doing nothing.
            #you might want to specify some extra behavior here.
            pass
    sys.stdout = Logger()

    if args.verbose:
        print('Current arguments: {0}'.format(args))
    dtype = 'float32'
    if args.verbose:
        print('Loading train data...')

    # Read one embedding matrix per line of emb_file; list position becomes
    # the language id used by the dictionary files.
    words = []
    emb = []
    with open(args.emb_file, encoding=args.encoding, errors='surrogateescape') as f:
        for line in f:
            srcfile = open(line.strip(), encoding=args.encoding, errors='surrogateescape')
            words_temp, x_temp = embeddings.read(srcfile,max_voc=args.max_vocab, dtype=dtype)
            words.append(words_temp)
            emb.append(x_temp)

    # Build word to index map (one per language)
    word2ind = []
    for lang in words:
        word2ind.append({word: i for i, word in enumerate(lang)})

    # Build training dictionary. Each train_pairs entry accumulates:
    # [src_lang, trg_lang, src_indices, trg_indices] here, and later
    # uniq_src, uniq_trg and the pair matrix A are appended (indices 4-6).
    train_pairs = []
    with open(args.dictionary_train_file, encoding=args.encoding, errors='surrogateescape') as ff:
        for line in ff:
            vals = line.split(',')
            curr_dict=[int(vals[0].strip()),int(vals[1].strip())]
            src_indices = []
            trg_indices = []
            with open(vals[2].strip(), encoding=args.encoding, errors='surrogateescape') as f:
                for line in f:
                    src,trg = line.split()
                    if args.max_vocab:
                        src=src.lower()
                        trg=trg.lower()
                    try:
                        # Both lookups precede both appends so the two index
                        # lists stay aligned when one side is OOV.
                        src_ind = word2ind[curr_dict[0]][src]
                        trg_ind = word2ind[curr_dict[1]][trg]
                        src_indices.append(src_ind)
                        trg_indices.append(trg_ind)
                    except KeyError:
                        if args.verbose:
                            print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg), file=sys.stderr)
            curr_dict.append(src_indices)
            curr_dict.append(trg_indices)
            train_pairs.append(curr_dict)
    if args.verbose:
        print('Normalizing embeddings...')

    # Step 0: Normalization (applied to every language's matrix)
    for action in args.normalize:
        if action == 'unit':
            for i in range(len(emb)):
                emb[i] = embeddings.length_normalize(emb[i])
        elif action == 'center':
            for i in range(len(emb)):
                emb[i] = embeddings.mean_center(emb[i])
        elif action == 'unitdim':
            for i in range(len(emb)):
                emb[i] = embeddings.length_normalize_dimensionwise(emb[i])
        elif action == 'centeremb':
            for i in range(len(emb)):
                emb[i] = embeddings.mean_center_embeddingwise(emb[i])

    # Step 1: Optimization
    if args.verbose:
        print('Beginning Optimization')
    start_time = time.time()
    mean_size=0
    for tp in range(len(train_pairs)):
        src_indices = train_pairs[tp][2]
        trg_indices = train_pairs[tp][3]
        x_count = len(set(src_indices))
        z_count = len(set(trg_indices))
        # A[i, j] = 1 iff the i-th unique source word and j-th unique target
        # word of this language pair form a training pair.
        A = np.zeros((x_count,z_count))

        # Creating dictionary matrix from training set
        map_dict_src={}
        map_dict_trg={}
        I=0
        uniq_src=[]
        uniq_trg=[]
        for i in range(len(src_indices)):
            if src_indices[i] not in map_dict_src.keys():
                map_dict_src[src_indices[i]]=I
                I+=1
                uniq_src.append(src_indices[i])
        J=0
        for j in range(len(trg_indices)):
            if trg_indices[j] not in map_dict_trg.keys():
                map_dict_trg[trg_indices[j]]=J
                J+=1
                uniq_trg.append(trg_indices[j])
        for i in range(len(src_indices)):
            A[map_dict_src[src_indices[i]],map_dict_trg[trg_indices[i]]]=1
        train_pairs[tp].append(uniq_src)
        train_pairs[tp].append(uniq_trg)
        train_pairs[tp].append(A)
        mean_size+= (len(uniq_src)*len(uniq_trg))
    mean_size = mean_size/len(train_pairs)
    np.random.seed(0)
    Lambda=args.l2_reg

    # One Stiefel factor per language plus a single shared positive-definite
    # B (the last optimization variable).
    variables=[]
    manif = []
    low_rank=emb[0].shape[1]
    for i in range(len(emb)):
        variables.append(TT.matrix())
        manif.append(Stiefel(emb[i].shape[1],low_rank))
    variables.append(TT.matrix())
    manif.append(PositiveDefinite(low_rank))
    B = variables[-1]
    cost = 0.5*Lambda*(TT.sum(B**2))
    # Sum of per-pair squared reconstruction errors, each normalized by the
    # number of training pairs for that language pair.
    for i in range(len(train_pairs)):
        x = emb[train_pairs[i][0]]
        z = emb[train_pairs[i][1]]
        U1 = variables[train_pairs[i][0]]
        U2 = variables[train_pairs[i][1]]
        cost = cost + TT.sum(((shared(x[train_pairs[i][4]]).dot(U1.dot(B.dot(U2.T)))).dot(shared(z[train_pairs[i][5]]).T)-shared(train_pairs[i][6]))**2)/float(len(train_pairs[i][2]))

    solver = ConjugateGradient(maxtime=args.max_opt_time,maxiter=args.max_opt_iter,mingradnorm=1e-12)
    manifold =Product(manif)
    problem = Problem(manifold=manifold, cost=cost, arg=variables, verbosity=3)
    wopt = solver.solve(problem)
    w= wopt
    U1 = w[0]
    U2 = w[1]
    B = w[2]

    # Step 2: Transformation — every language is projected into the shared
    # latent space, with B^{1/2} splitting the learned metric symmetrically.
    Bhalf = scipy.linalg.sqrtm(wopt[-1])
    test_emb = []
    for i in range(len(emb)):
        test_emb.append(emb[i].dot(wopt[i]).dot(Bhalf))
    end_time = time.time()
    if args.verbose:
        print('Completed training in {0:.2f} seconds'.format(end_time-start_time))
    gc.collect()

    # Step 3: Evaluation — CSLS retrieval for every test language pair.
    if args.verbose:
        print('Beginning Evaluation')
    if args.normalize_eval:
        for i in range(len(test_emb)):
            test_emb[i] = embeddings.length_normalize(test_emb[i])

    # Loading test dictionary (same "src,trg,path" per-line format)
    with open(args.dictionary_test_file, encoding=args.encoding, errors='surrogateescape') as ff:
        for line in ff:
            vals = line.split(',')
            curr_dict=[int(vals[0].strip()),int(vals[1].strip())]
            with open(vals[2].strip(), encoding=args.encoding, errors='surrogateescape') as f:
                src_word2ind = word2ind[curr_dict[0]]
                trg_word2ind = word2ind[curr_dict[1]]
                xw = test_emb[curr_dict[0]]
                zw = test_emb[curr_dict[1]]
                src2trg = collections.defaultdict(set)
                trg2src = collections.defaultdict(set)
                oov = set()
                vocab = set()
                for line in f:
                    src, trg = line.split()
                    if args.max_vocab:
                        src=src.lower()
                        trg=trg.lower()
                    try:
                        src_ind = src_word2ind[src]
                        trg_ind = trg_word2ind[trg]
                        src2trg[src_ind].add(trg_ind)
                        trg2src[trg_ind].add(src_ind)
                        vocab.add(src)
                    except KeyError:
                        oov.add(src)
                src = list(src2trg.keys())
                trgt = list(trg2src.keys())
                oov -= vocab # If one of the translation options is in the vocabulary, then the entry is not an oov
                coverage = len(src2trg) / (len(src2trg) + len(oov))
                f.close()
                translation = collections.defaultdict(int)
                translation5 = collections.defaultdict(list)
                translation10 = collections.defaultdict(list)
                t=time.time()
                # CSLS: mean similarity of each source word to its k nearest
                # targets (CPU) ...
                nbrhood_x=np.zeros(xw.shape[0])
                nbrhood_z=np.zeros(zw.shape[0])
                nbrhood_z2=cp.zeros(zw.shape[0])
                for i in range(0, len(src), BATCH_SIZE):
                    j = min(i + BATCH_SIZE, len(src))
                    similarities = xw[src[i:j]].dot(zw.T)
                    similarities_x = -1*np.partition(-1*similarities,args.csls_neighbourhood-1 ,axis=1)
                    nbrhood_x[src[i:j]]=np.mean(similarities_x[:,:args.csls_neighbourhood],axis=1)
                # ... and of each target word to its k nearest sources (GPU)
                batch_num=1
                for i in range(0, zw.shape[0], BATCH_SIZE):
                    j = min(i + BATCH_SIZE, zw.shape[0])
                    similarities = -1*cp.partition(-1*cp.dot(cp.asarray(zw[i:j]),cp.transpose(cp.asarray(xw))),args.csls_neighbourhood-1 ,axis=1)[:,:args.csls_neighbourhood]
                    nbrhood_z2[i:j]=(cp.mean(similarities[:,:args.csls_neighbourhood],axis=1))
                    batch_num+=1
                nbrhood_z=cp.asnumpy(nbrhood_z2)
                # CSLS score = 2*cos - source-hubness - target-hubness;
                # keep the best, top-5 and top-10 target indices per word.
                for i in range(0, len(src), BATCH_SIZE):
                    j = min(i + BATCH_SIZE, len(src))
                    similarities = xw[src[i:j]].dot(zw.T)
                    similarities = np.transpose(np.transpose(2*similarities) - nbrhood_x[src[i:j]])- nbrhood_z
                    nn = similarities.argmax(axis=1).tolist()
                    similarities = np.argsort((similarities),axis=1)
                    nn5 = (similarities[:,-5:])
                    nn10 = (similarities[:,-10:])
                    for k in range(j-i):
                        translation[src[i+k]] = nn[k]
                        translation5[src[i+k]] = nn5[k]
                        translation10[src[i+k]] = nn10[k]
                # Precision@1 / @5 / @10
                accuracy = np.mean([1 if translation[i] in src2trg[i] else 0 for i in src])
                mean=0
                for i in src:
                    for k in translation5[i]:
                        if k in src2trg[i]:
                            mean+=1
                            break
                mean/=len(src)
                accuracy5 = mean
                mean=0
                for i in src:
                    for k in translation10[i]:
                        if k in src2trg[i]:
                            mean+=1
                            break
                mean/=len(src)
                accuracy10 = mean
                print('Coverage:{0:7.2%} Accuracy:{1:7.2%} Accuracy(Top 5):{2:7.2%} Accuracy(Top 10):{3:7.2%}'.format(coverage, accuracy, accuracy5, accuracy10))
def main():
    """Semi-supervised GeoMM: learn a bilingual mapping with bootstrapping.

    Pipeline (all driven by command-line arguments):
      0. Normalize the source/target embeddings.
      1. Bootstrap loop: optimize the GeoMM objective on the current
         dictionary (Riemannian CG over Stiefel x Stiefel x PositiveDefinite),
         transform the embeddings, score on a validation split with CSLS,
         and induce a new dictionary bidirectionally from the top of each
         vocabulary. Stops after `--patience` iterations without improvement.
      2. Retrain on the best bootstrapped dictionary plus the full train set.
      3. Evaluate P@1/P@5/P@10 on the test dictionary with CSLS.

    Side effects: replaces sys.stdout with a tee-to-logfile Logger and
    writes a log under logs/geomm_semi/<timestamp>/.
    Relies on module-level imports: argparse, sys, os, datetime, time, gc,
    collections, numpy as np, cupy as cp, scipy, theano.tensor as TT,
    `shared`, pymanopt objects, and the project-local `embeddings` module.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Map the source embeddings into the target embedding space')
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--max_vocab', default=0, type=int, help='Maximum vocabulary to be loaded, 0 allows complete vocabulary')
    parser.add_argument('--verbose', default=0, type=int, help='Verbose')
    mapping_group = parser.add_argument_group('mapping arguments', 'Basic embedding mapping arguments')
    mapping_group.add_argument('-dtrain', '--dictionary_train', default=sys.stdin.fileno(), help='the training dictionary file (defaults to stdin)')
    mapping_group.add_argument('-dtest', '--dictionary_test', default=sys.stdin.fileno(), help='the test dictionary file (defaults to stdin)')
    mapping_group.add_argument('-dtrainspl', '--dictionary_trainspl', default=sys.stdin.fileno(), help='the training dictionary split file (defaults to stdin)')
    mapping_group.add_argument('-dvalspl', '--dictionary_valspl', default=sys.stdin.fileno(), help='the validation dictionary split file (defaults to stdin)')
    mapping_group.add_argument('--normalize', choices=['unit', 'center', 'unitdim', 'centeremb'], nargs='*', default=[], help='the normalization actions to perform in order')
    geomm_group = parser.add_argument_group('GeoMM arguments', 'Arguments for GeoMM method')
    geomm_group.add_argument('--l2_reg', type=float, default=1e-1, help='Lambda for L2 Regularization')
    geomm_group.add_argument('--max_opt_time', type=int, default=5000, help='Maximum time limit for optimization in seconds')
    geomm_group.add_argument('--max_opt_iter', type=int, default=150, help='Maximum number of iterations for optimization')
    geomm_group.add_argument('--x_cutoff', type=int, default=25000, help='Vocabulary cutoff for first language for bootstrapping')
    geomm_group.add_argument('--z_cutoff', type=int, default=25000, help='Vocabulary cutoff for second language for bootstrapping')
    geomm_group.add_argument('--patience', type=int, default=1, help='Number of iterations with a decrease in validation accuracy permissible during bootstrapping')
    eval_group = parser.add_argument_group('evaluation arguments', 'Arguments for evaluation')
    eval_group.add_argument('--normalize_eval', action='store_true', help='Normalize the embeddings at test time')
    eval_group.add_argument('--eval_batch_size', type=int, default=500, help='Batch size for evaluation')
    eval_group.add_argument('--csls_neighbourhood', type=int, default=10, help='Neighbourhood size for CSLS')
    args = parser.parse_args()
    BATCH_SIZE = args.eval_batch_size

    # Logging: one timestamped directory per run, log file named after the
    # training dictionary's basename.
    method_name = os.path.join('logs', 'geomm_semi')
    directory = os.path.join(os.path.join(os.getcwd(), method_name), datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    if not os.path.exists(directory):
        os.makedirs(directory)
    log_file_name, file_extension = os.path.splitext(os.path.basename(args.dictionary_train))
    log_file_name = log_file_name + '.log'

    class Logger(object):
        # Tees every write to both the real stdout and the log file.
        # NOTE(review): the log file handle is never closed explicitly; it is
        # released only at interpreter exit.
        def __init__(self):
            self.terminal = sys.stdout
            self.log = open(os.path.join(directory, log_file_name), "a")

        def write(self, message):
            self.terminal.write(message)
            self.log.write(message)

        def flush(self):
            #this flush method is needed for python 3 compatibility.
            #this handles the flush command by doing nothing.
            #you might want to specify some extra behavior here.
            pass
    # NOTE(review): global stdout replacement — all subsequent prints
    # (including solver verbosity) go through the Logger.
    sys.stdout = Logger()
    if args.verbose:
        print('Current arguments: {0}'.format(args))
    dtype = 'float32'
    if args.verbose:
        print('Loading train data...')
    # Read input embeddings
    # NOTE(review): srcfile/trgfile are never closed; a `with` block would be
    # safer but behavior is unchanged here.
    srcfile = open(args.src_input, encoding=args.encoding, errors='surrogateescape')
    trgfile = open(args.trg_input, encoding=args.encoding, errors='surrogateescape')
    src_words, x = embeddings.read(srcfile, max_voc=args.max_vocab, dtype=dtype)
    trg_words, z = embeddings.read(trgfile, max_voc=args.max_vocab, dtype=dtype)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Build training dictionary (full train set, kept aside for Step 2)
    src_indices = []
    trg_indices = []
    f = open(args.dictionary_train, encoding=args.encoding, errors='surrogateescape')
    for line in f:
        src, trg = line.split()
        if args.max_vocab:
            # Lowercasing is only applied when a vocabulary cap is in effect;
            # presumably the capped embeddings are lowercased — TODO confirm.
            src = src.lower()
            trg = trg.lower()
        try:
            src_ind = src_word2ind[src]
            trg_ind = trg_word2ind[trg]
            src_indices.append(src_ind)
            trg_indices.append(trg_ind)
        except KeyError:
            if args.verbose:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg), file=sys.stderr)
    f.close()
    # NOTE(review): the two self-assignments below are no-ops.
    src_indices = src_indices
    trg_indices = trg_indices
    src_indices_train = list(src_indices)
    trg_indices_train = list(trg_indices)
    src_indices = []
    trg_indices = []

    # Loading train-split dictionary (the subset actually used for the
    # bootstrap optimization; the val split measures progress)
    f = open(args.dictionary_trainspl, encoding=args.encoding, errors='surrogateescape')
    for line in f:
        src, trg = line.split()
        if args.max_vocab:
            src = src.lower()
            trg = trg.lower()
        try:
            src_ind = src_word2ind[src]
            trg_ind = trg_word2ind[trg]
            src_indices.append(src_ind)
            trg_indices.append(trg_ind)
        except KeyError:
            if args.verbose:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg), file=sys.stderr)
    f.close()

    if args.verbose:
        print('Normalizing embeddings...')
    # STEP 0: Normalization (actions applied in the order given on the CLI)
    for action in args.normalize:
        if action == 'unit':
            x = embeddings.length_normalize(x)
            z = embeddings.length_normalize(z)
        elif action == 'center':
            x = embeddings.mean_center(x)
            z = embeddings.mean_center(z)
        elif action == 'unitdim':
            x = embeddings.length_normalize_dimensionwise(x)
            z = embeddings.length_normalize_dimensionwise(z)
        elif action == 'centeremb':
            x = embeddings.mean_center_embeddingwise(x)
            z = embeddings.mean_center_embeddingwise(z)

    orig_src = src_indices
    orig_trg = trg_indices
    best_val_acc = 0
    best_add_src = []
    best_add_trg = []
    add_src = []
    add_trg = []
    if args.verbose:
        print('Beginning Optimization')
    start_time = time.time()
    it_count = 0
    drop_count = 0

    # Bootstrap loop: optimize -> transform -> validate -> induce dictionary
    while True:
        if args.verbose:
            print('Starting bootstrap iteration {0}'.format(it_count + 1))
        # Step 1.1: Optimization
        # NOTE(review): x_count/z_count are computed but never used here.
        x_count = len(set(src_indices))
        z_count = len(set(trg_indices))

        # Creating dictionary matrix from training set: collect unique
        # source/target indices in first-seen order (map_dict_* give each
        # index its position among the uniques).
        map_dict_src = {}
        map_dict_trg = {}
        I = 0
        uniq_src = []
        uniq_trg = []
        for i in range(len(src_indices)):
            if src_indices[i] not in map_dict_src.keys():
                map_dict_src[src_indices[i]] = I
                I += 1
                uniq_src.append(src_indices[i])
        J = 0
        for j in range(len(trg_indices)):
            if trg_indices[j] not in map_dict_trg.keys():
                map_dict_trg[trg_indices[j]] = J
                J += 1
                uniq_trg.append(trg_indices[j])

        np.random.seed(0)
        Lambda = args.l2_reg
        # Symbolic variables for the three manifold factors:
        # U1, U2 orthonormal (Stiefel), B positive definite.
        U1 = TT.matrix()
        U2 = TT.matrix()
        B = TT.matrix()
        # Gram matrices over the unique dictionary rows; used to express
        # ||X W Z^T||-type terms without forming the full similarity matrix.
        X_tot = x[uniq_src].T.dot(x[uniq_src])
        Z_tot = z[uniq_trg].T.dot(z[uniq_trg])
        W = U1.dot(B.dot(U2.T))
        # GeoMM objective: trace term + cross term, averaged over dictionary
        # pairs, plus L2 regularization on B.
        cost = (TT.nlinalg.trace(U2.dot(B.dot(U1.T.dot(shared(X_tot).dot(U1.dot(B.dot(U2.T.dot(shared(Z_tot))))))))) - 2 * TT.sum((shared(x[src_indices]).dot(W)) * shared(z[trg_indices]))) / (len(src_indices)) + 0.5 * Lambda * (TT.sum(B**2))
        solver = ConjugateGradient(maxtime=args.max_opt_time, maxiter=args.max_opt_iter, mingradnorm=1e-15)
        low_rank = 300
        manifold = Product([Stiefel(x.shape[1], low_rank), Stiefel(z.shape[1], low_rank), PositiveDefinite(low_rank)])
        problem = Problem(manifold=manifold, cost=cost, arg=[U1, U2, B], verbosity=3)
        wopt = solver.solve(problem)
        w = wopt
        # Rebind the symbolic names to the numeric optima.
        U1 = w[0]
        U2 = w[1]
        B = w[2]

        # Step 1.2: Transformation into the shared latent space via B^(1/2).
        # NOTE(review): scipy.linalg.sqrtm can return a complex array for
        # ill-conditioned B — presumably B stays PD so the result is real;
        # TODO confirm.
        xw = x.dot(U1).dot(scipy.linalg.sqrtm(B))
        zw = z.dot(U2).dot(scipy.linalg.sqrtm(B))
        it_count += 1

        # Step 1.3: Compute Validation Accuracy (CSLS retrieval on val split)
        if args.normalize_eval:
            xw = embeddings.length_normalize(xw)
            zw = embeddings.length_normalize(zw)

        # Loading validation dictionary
        f = open(args.dictionary_valspl, encoding=args.encoding, errors='surrogateescape')
        src2trg = collections.defaultdict(set)
        trg2src = collections.defaultdict(set)
        oov = set()
        vocab = set()
        for line in f:
            src, trg = line.split()
            if args.max_vocab:
                src = src.lower()
                trg = trg.lower()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                src2trg[src_ind].add(trg_ind)
                trg2src[trg_ind].add(src_ind)
                vocab.add(src)
            except KeyError:
                oov.add(src)
        src = list(src2trg.keys())
        # NOTE(review): trgt is never used afterwards.
        trgt = list(trg2src.keys())
        oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
        coverage = len(src2trg) / (len(src2trg) + len(oov))
        f.close()

        translation = collections.defaultdict(int)
        translation5 = collections.defaultdict(list)
        translation10 = collections.defaultdict(list)
        t = time.time()
        # CSLS neighbourhood terms: mean similarity to each word's k nearest
        # neighbours in the other language (computed batched on GPU via cupy).
        nbrhood_x = cp.zeros(xw.shape[0])
        nbrhood_z = cp.zeros(zw.shape[0])
        for i in range(0, len(src), BATCH_SIZE):
            j = min(i + BATCH_SIZE, len(src))
            # cp.partition on negated scores yields the k largest similarities.
            similarities = -1 * cp.partition(-1 * cp.dot(cp.asarray(xw[src[i:j]]), cp.transpose(cp.asarray(zw))), args.csls_neighbourhood - 1, axis=1)[:, :args.csls_neighbourhood]
            nbrhood_x[src[i:j]] = (cp.mean(similarities, axis=1))
        for i in range(0, zw.shape[0], BATCH_SIZE):
            j = min(i + BATCH_SIZE, zw.shape[0])
            similarities = -1 * cp.partition(-1 * cp.dot(cp.asarray(zw[i:j]), cp.transpose(cp.asarray(xw))), args.csls_neighbourhood - 1, axis=1)[:, :args.csls_neighbourhood]
            nbrhood_z[i:j] = (cp.mean(similarities, axis=1))
        for i in range(0, len(src), BATCH_SIZE):
            j = min(i + BATCH_SIZE, len(src))
            # CSLS score: 2*cos(x, z) - r(x) - r(z); argmax gives P@1 candidate.
            similarities = cp.transpose(cp.transpose(2 * cp.asarray(xw[src[i:j]]).dot(cp.transpose(cp.asarray(zw)))) - nbrhood_x[src[i:j]]) - nbrhood_z
            nn = cp.argmax(similarities, axis=1).tolist()
            similarities = cp.argsort((similarities), axis=1)
            nn5 = (similarities[:, -5:])
            nn10 = (similarities[:, -10:])
            for k in range(j - i):
                translation[src[i + k]] = nn[k]
                translation5[src[i + k]] = nn5[k].tolist()
                translation10[src[i + k]] = nn10[k].tolist()
        accuracy = np.mean([1 if translation[i] in src2trg[i] else 0 for i in src])
        mean = 0
        for i in src:
            for k in translation5[i]:
                if k in src2trg[i]:
                    mean += 1
                    break
        mean /= len(src)
        accuracy5 = mean
        mean = 0
        for i in src:
            for k in translation10[i]:
                if k in src2trg[i]:
                    mean += 1
                    break
        mean /= len(src)
        accuracy10 = mean
        drop_count += 1
        if accuracy > best_val_acc:
            if args.verbose:
                print('Improvement of {0}% over best validation accuracy!'.format((accuracy - best_val_acc) * 100))
            best_val_acc = accuracy
            # Remember the induced dictionary that produced the best val score.
            best_add_src = list(add_src)
            best_add_trg = list(add_trg)
            drop_count = 0
        if args.verbose:
            print('Val Set:- Coverage:{0:7.2%} Accuracy:{1:7.2%} Accuracy(Top 5):{2:7.2%} Accuracy(Top 10):{3:7.2%}'.format(coverage, accuracy, accuracy5, accuracy10))
        if drop_count >= args.patience:
            if args.verbose:
                print('Training ended')
            break

        # Step 1.4: Dictionary Induction Stage (Bootstrap)
        # Consider x_cutoff and z_cutoff to be the vocabulary of the two languages(First k words of vocabulary are the most frequent words in the language(as per standard word embeddings)).
        # CSLS Inferencing will be performed on this vocabulary subset. Bidirectional bootstrapping is performed.
        # Dictionary entries for first "x_cutoff" words of Language-1 and for first "z-cutoff" words of Language-2 are inferred. Original training dictionary is also added.
        # Total dictionary size=x_cutoff+z_cutoff+size(train_set)
        if args.normalize_eval:
            xw = embeddings.length_normalize(xw)
            zw = embeddings.length_normalize(zw)
        x_vocab_size = min(xw.shape[0], args.x_cutoff)
        z_vocab_size = min(zw.shape[0], args.z_cutoff)
        t = time.time()
        nbrhood_x = cp.zeros(x_vocab_size)
        # NOTE(review): best_sim_x is filled but never read; best_sim_x_csls
        # is never used at all.
        best_sim_x = cp.zeros(x_vocab_size)
        best_sim_x_csls = cp.zeros(x_vocab_size)
        nbrhood_z = cp.zeros(z_vocab_size)
        batch_num = 1
        for i in range(0, x_vocab_size, BATCH_SIZE):
            j = min(i + BATCH_SIZE, x_vocab_size)
            similarities = -1 * cp.partition(-1 * cp.dot(cp.asarray(xw[i:j]), cp.transpose(cp.asarray(zw[:z_vocab_size]))), args.csls_neighbourhood - 1, axis=1)[:, :args.csls_neighbourhood]
            nbrhood_x[i:j] = (cp.mean(similarities, axis=1))
            best_sim_x[i:j] = (cp.max(similarities, axis=1))
            batch_num += 1
        batch_num = 1
        for i in range(0, z_vocab_size, BATCH_SIZE):
            j = min(i + BATCH_SIZE, z_vocab_size)
            similarities = -1 * cp.partition(-1 * cp.dot(cp.asarray(zw[i:j]), cp.transpose(cp.asarray(xw[:x_vocab_size]))), args.csls_neighbourhood - 1, axis=1)[:, :args.csls_neighbourhood]
            nbrhood_z[i:j] = (cp.mean(similarities, axis=1))
            batch_num += 1
        # Forward direction: translate the first x_vocab_size source words.
        src_indices = list(range(0, x_vocab_size))
        trg_indices = []
        batch_num = 1
        for i in range(0, x_vocab_size, BATCH_SIZE):
            j = min(i + BATCH_SIZE, x_vocab_size)
            similarities = cp.transpose(cp.transpose(2 * cp.asarray(xw[i:j]).dot(cp.transpose(cp.asarray(zw[:z_vocab_size])))) - nbrhood_x[i:j]) - nbrhood_z
            nn = cp.argmax(similarities, axis=1).tolist()
            trg_indices.append(nn)
            batch_num += 1
        # Backward direction: translate the first z_vocab_size target words.
        src_indices2 = []
        trg_indices2 = list(range(0, z_vocab_size))
        batch_num = 1
        for i in range(0, z_vocab_size, BATCH_SIZE):
            j = min(i + BATCH_SIZE, z_vocab_size)
            similarities = cp.transpose(cp.transpose(2 * cp.asarray(zw[i:j]).dot(cp.transpose(cp.asarray(xw[:x_vocab_size])))) - nbrhood_z[i:j]) - nbrhood_x
            nn = cp.argmax(similarities, axis=1).tolist()
            src_indices2.append(nn)
            batch_num += 1
        # Flatten the per-batch lists of induced indices.
        trg_indices = [item for sublist in trg_indices for item in sublist]
        src_indices2 = [item for sublist in src_indices2 for item in sublist]
        add_src = list(src_indices + src_indices2)
        add_trg = list(trg_indices + trg_indices2)
        # Next iteration trains on induced pairs plus the original train split.
        src_indices = src_indices + src_indices2 + orig_src
        trg_indices = trg_indices + trg_indices2 + orig_trg

    end_time = time.time()
    if args.verbose:
        print('Completed bootstrapping in {0:.2f} seconds'.format(end_time - start_time))

    # Step 2: Final Training with bootstrapped dictionary (best induced pairs
    # plus the full training dictionary); mirrors the optimization above.
    if args.verbose:
        print('Training final model')
    src_indices = best_add_src + src_indices_train
    trg_indices = best_add_trg + trg_indices_train
    # NOTE(review): x_count/z_count again unused.
    x_count = len(set(src_indices))
    z_count = len(set(trg_indices))

    # Creating dictionary matrix from training set
    map_dict_src = {}
    map_dict_trg = {}
    I = 0
    uniq_src = []
    uniq_trg = []
    for i in range(len(src_indices)):
        if src_indices[i] not in map_dict_src.keys():
            map_dict_src[src_indices[i]] = I
            I += 1
            uniq_src.append(src_indices[i])
    J = 0
    for j in range(len(trg_indices)):
        if trg_indices[j] not in map_dict_trg.keys():
            map_dict_trg[trg_indices[j]] = J
            J += 1
            uniq_trg.append(trg_indices[j])

    np.random.seed(0)
    Lambda = args.l2_reg
    U1 = TT.matrix()
    U2 = TT.matrix()
    B = TT.matrix()
    X_tot = x[uniq_src].T.dot(x[uniq_src])
    Z_tot = z[uniq_trg].T.dot(z[uniq_trg])
    W = U1.dot(B.dot(U2.T))
    cost = (TT.nlinalg.trace(U2.dot(B.dot(U1.T.dot(shared(X_tot).dot(U1.dot(B.dot(U2.T.dot(shared(Z_tot))))))))) - 2 * TT.sum((shared(x[src_indices]).dot(W)) * shared(z[trg_indices]))) / len(src_indices) + 0.5 * Lambda * (TT.sum(B**2))
    # NOTE(review): unlike the bootstrap solver, no mingradnorm is set here,
    # so the library default applies.
    solver = ConjugateGradient(maxtime=args.max_opt_time, maxiter=args.max_opt_iter)
    low_rank = 300
    manifold = Product([Stiefel(x.shape[1], low_rank), Stiefel(z.shape[1], low_rank), PositiveDefinite(low_rank)])
    problem = Problem(manifold=manifold, cost=cost, arg=[U1, U2, B], verbosity=3)
    wopt = solver.solve(problem)
    w = wopt
    U1 = w[0]
    U2 = w[1]
    B = w[2]
    xw = x.dot(U1).dot(scipy.linalg.sqrtm(B))
    zw = z.dot(U2).dot(scipy.linalg.sqrtm(B))
    gc.collect()

    # Step 3: Evaluation on the held-out test dictionary (CSLS retrieval;
    # same procedure as validation but with numpy/cupy mixed batching).
    if args.verbose:
        print('Beginning Evaluation')
    if args.normalize_eval:
        xw = embeddings.length_normalize(xw)
        zw = embeddings.length_normalize(zw)

    # Loading test dictionary
    f = open(args.dictionary_test, encoding=args.encoding, errors='surrogateescape')
    src2trg = collections.defaultdict(set)
    trg2src = collections.defaultdict(set)
    oov = set()
    vocab = set()
    for line in f:
        src, trg = line.split()
        if args.max_vocab:
            src = src.lower()
            trg = trg.lower()
        try:
            src_ind = src_word2ind[src]
            trg_ind = trg_word2ind[trg]
            src2trg[src_ind].add(trg_ind)
            trg2src[trg_ind].add(src_ind)
            vocab.add(src)
        except KeyError:
            oov.add(src)
    src = list(src2trg.keys())
    trgt = list(trg2src.keys())
    oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
    coverage = len(src2trg) / (len(src2trg) + len(oov))
    f.close()

    translation = collections.defaultdict(int)
    translation5 = collections.defaultdict(list)
    translation10 = collections.defaultdict(list)
    t = time.time()
    nbrhood_x = np.zeros(xw.shape[0])
    nbrhood_z = np.zeros(zw.shape[0])
    nbrhood_z2 = cp.zeros(zw.shape[0])
    for i in range(0, len(src), BATCH_SIZE):
        j = min(i + BATCH_SIZE, len(src))
        similarities = xw[src[i:j]].dot(zw.T)
        # np.partition on negated scores keeps the k largest similarities.
        similarities_x = -1 * np.partition(-1 * similarities, args.csls_neighbourhood - 1, axis=1)
        nbrhood_x[src[i:j]] = np.mean(similarities_x[:, :args.csls_neighbourhood], axis=1)
    batch_num = 1
    for i in range(0, zw.shape[0], BATCH_SIZE):
        j = min(i + BATCH_SIZE, zw.shape[0])
        similarities = -1 * cp.partition(-1 * cp.dot(cp.asarray(zw[i:j]), cp.transpose(cp.asarray(xw))), args.csls_neighbourhood - 1, axis=1)[:, :args.csls_neighbourhood]
        nbrhood_z2[i:j] = (cp.mean(similarities, axis=1))
        batch_num += 1
    # Bring the GPU-computed target neighbourhood back to host memory.
    nbrhood_z = cp.asnumpy(nbrhood_z2)
    for i in range(0, len(src), BATCH_SIZE):
        j = min(i + BATCH_SIZE, len(src))
        similarities = xw[src[i:j]].dot(zw.T)
        # CSLS score: 2*cos - r(x) - r(z).
        similarities = np.transpose(np.transpose(2 * similarities) - nbrhood_x[src[i:j]]) - nbrhood_z
        nn = similarities.argmax(axis=1).tolist()
        similarities = np.argsort((similarities), axis=1)
        nn5 = (similarities[:, -5:])
        nn10 = (similarities[:, -10:])
        for k in range(j - i):
            translation[src[i + k]] = nn[k]
            translation5[src[i + k]] = nn5[k]
            translation10[src[i + k]] = nn10[k]
    accuracy = np.mean([1 if translation[i] in src2trg[i] else 0 for i in src])
    mean = 0
    for i in src:
        for k in translation5[i]:
            if k in src2trg[i]:
                mean += 1
                break
    mean /= len(src)
    accuracy5 = mean
    mean = 0
    for i in src:
        for k in translation10[i]:
            if k in src2trg[i]:
                mean += 1
                break
    mean /= len(src)
    accuracy10 = mean
    print('Coverage:{0:7.2%} Accuracy:{1:7.2%} Accuracy(Top 5):{2:7.2%} Accuracy(Top 10):{3:7.2%}'.format(coverage, accuracy, accuracy5, accuracy10))
def main():
    """GeoMM over a pivot language: CMP and PIP evaluation.

    Learns two pairwise GeoMM mappings (source->pivot on dictionary_train1,
    pivot->target on dictionary_train2), then evaluates source->target
    translation on the test dictionary in two ways:
      - CMP (composition): compose the two learned maps, x.dot(w12).dot(w23),
        and retrieve directly in the target space with CSLS.
      - PIP (pipeline): first translate each source word to a pivot word
        (stage 1), then translate that pivot word to the target (stage 2).

    Side effects: replaces sys.stdout with a tee-to-logfile Logger and
    writes a log under logs/geomm_cmp_pip/<timestamp>/.
    Relies on module-level imports: argparse, sys, os, datetime, time, gc,
    collections, numpy as np, cupy as cp, scipy, theano.tensor as TT,
    `shared`, pymanopt objects, and the project-local `embeddings` module.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Map the source embeddings into the target embedding space')
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('mid_input', help='the input pivot embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--max_vocab', default=0, type=int, help='Maximum vocabulary to be loaded, 0 allows complete vocabulary')
    parser.add_argument('--verbose', default=0, type=int, help='Verbose')
    mapping_group = parser.add_argument_group('mapping arguments', 'Basic embedding mapping arguments')
    mapping_group.add_argument('-dtrain1', '--dictionary_train1', default=sys.stdin.fileno(), help='the first training dictionary file (defaults to stdin)')
    mapping_group.add_argument('-dtrain2', '--dictionary_train2', default=sys.stdin.fileno(), help='the second training dictionary file (defaults to stdin)')
    mapping_group.add_argument('-dtest', '--dictionary_test', default=sys.stdin.fileno(), help='the test dictionary file (defaults to stdin)')
    mapping_group.add_argument('--normalize', choices=['unit', 'center', 'unitdim', 'centeremb'], nargs='*', default=[], help='the normalization actions to perform in order')
    geomm_group = parser.add_argument_group('GeoMM arguments', 'Arguments for GeoMM method')
    geomm_group.add_argument('--l2_reg', type=float, default=1e2, help='Lambda for L2 Regularization')
    geomm_group.add_argument('--max_opt_time', type=int, default=5000, help='Maximum time limit for optimization in seconds')
    geomm_group.add_argument('--max_opt_iter', type=int, default=150, help='Maximum number of iterations for optimization')
    eval_group = parser.add_argument_group('evaluation arguments', 'Arguments for evaluation')
    eval_group.add_argument('--normalize_eval', action='store_true', help='Normalize the embeddings at test time')
    eval_group.add_argument('--eval_batch_size', type=int, default=1000, help='Batch size for evaluation')
    eval_group.add_argument('--csls_neighbourhood', type=int, default=10, help='Neighbourhood size for CSLS')
    args = parser.parse_args()
    BATCH_SIZE = args.eval_batch_size

    # Logging: one timestamped directory per run, log file named after the
    # test dictionary's basename.
    method_name = os.path.join('logs', 'geomm_cmp_pip')
    directory = os.path.join(os.path.join(os.getcwd(), method_name), datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    if not os.path.exists(directory):
        os.makedirs(directory)
    log_file_name, file_extension = os.path.splitext(os.path.basename(args.dictionary_test))
    log_file_name = log_file_name + '.log'

    class Logger(object):
        # Tees every write to both the real stdout and the log file.
        # NOTE(review): the log file handle is never closed explicitly.
        def __init__(self):
            self.terminal = sys.stdout
            self.log = open(os.path.join(directory, log_file_name), "a")

        def write(self, message):
            self.terminal.write(message)
            self.log.write(message)

        def flush(self):
            #this flush method is needed for python 3 compatibility.
            #this handles the flush command by doing nothing.
            #you might want to specify some extra behavior here.
            pass
    # NOTE(review): global stdout replacement — all subsequent prints go
    # through the Logger.
    sys.stdout = Logger()
    if args.verbose:
        print('Current arguments: {0}'.format(args))
    dtype = 'float32'
    if args.verbose:
        print('Loading train data...')
    # Read input embeddings (x = source, y = pivot, z = target)
    # NOTE(review): the three file handles are never closed.
    srcfile = open(args.src_input, encoding=args.encoding, errors='surrogateescape')
    midfile = open(args.mid_input, encoding=args.encoding, errors='surrogateescape')
    trgfile = open(args.trg_input, encoding=args.encoding, errors='surrogateescape')
    src_words, x = embeddings.read(srcfile, max_voc=args.max_vocab, dtype=dtype)
    mid_words, y = embeddings.read(midfile, max_voc=args.max_vocab, dtype=dtype)
    trg_words, z = embeddings.read(trgfile, max_voc=args.max_vocab, dtype=dtype)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    mid_word2ind = {word: i for i, word in enumerate(mid_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Build training dictionary-1 (source -> pivot pairs)
    src_indices12 = []
    trg_indices12 = []
    f = open(args.dictionary_train1, encoding=args.encoding, errors='surrogateescape')
    for line in f:
        src, trg = line.split()
        if args.max_vocab:
            # Lowercasing only under a vocabulary cap; presumably matches how
            # the capped embeddings were built — TODO confirm.
            src = src.lower()
            trg = trg.lower()
        try:
            src_ind = src_word2ind[src]
            trg_ind = mid_word2ind[trg]
            src_indices12.append(src_ind)
            trg_indices12.append(trg_ind)
        except KeyError:
            if args.verbose:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg), file=sys.stderr)
    f.close()

    # Build training dictionary-2 (pivot -> target pairs)
    src_indices23 = []
    trg_indices23 = []
    f = open(args.dictionary_train2, encoding=args.encoding, errors='surrogateescape')
    for line in f:
        src, trg = line.split()
        if args.max_vocab:
            src = src.lower()
            trg = trg.lower()
        try:
            src_ind = mid_word2ind[src]
            trg_ind = trg_word2ind[trg]
            src_indices23.append(src_ind)
            trg_indices23.append(trg_ind)
        except KeyError:
            if args.verbose:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg), file=sys.stderr)
    f.close()

    if args.verbose:
        print('Normalizing embeddings...')
    # STEP 0: Normalization (actions applied in the order given on the CLI)
    for action in args.normalize:
        if action == 'unit':
            x = embeddings.length_normalize(x)
            y = embeddings.length_normalize(y)
            z = embeddings.length_normalize(z)
        elif action == 'center':
            x = embeddings.mean_center(x)
            y = embeddings.mean_center(y)
            z = embeddings.mean_center(z)
        elif action == 'unitdim':
            x = embeddings.length_normalize_dimensionwise(x)
            y = embeddings.length_normalize_dimensionwise(y)
            z = embeddings.length_normalize_dimensionwise(z)
        elif action == 'centeremb':
            x = embeddings.mean_center_embeddingwise(x)
            y = embeddings.mean_center_embeddingwise(y)
            z = embeddings.mean_center_embeddingwise(z)

    # Step 1.1: Optimization-1 (learn source -> pivot GeoMM map)
    if args.verbose:
        print('Beginning Optimization-1')
    start_time = time.time()
    x_count = len(set(src_indices12))
    y_count = len(set(trg_indices12))
    # A is the 0/1 dictionary alignment matrix over unique indices.
    A = np.zeros((x_count, y_count))

    # Creating dictionary matrix from training set: collect unique indices
    # in first-seen order; map_dict_* give each index its row/column in A.
    map_dict_src = {}
    map_dict_trg = {}
    I = 0
    uniq_src = []
    uniq_trg = []
    for i in range(len(src_indices12)):
        if src_indices12[i] not in map_dict_src.keys():
            map_dict_src[src_indices12[i]] = I
            I += 1
            uniq_src.append(src_indices12[i])
    J = 0
    for j in range(len(trg_indices12)):
        if trg_indices12[j] not in map_dict_trg.keys():
            map_dict_trg[trg_indices12[j]] = J
            J += 1
            uniq_trg.append(trg_indices12[j])
    for i in range(len(src_indices12)):
        A[map_dict_src[src_indices12[i]], map_dict_trg[trg_indices12[i]]] = 1

    np.random.seed(0)
    Lambda = args.l2_reg
    # Symbolic factors: U1, U2 orthonormal (Stiefel), B positive definite.
    U1 = TT.matrix()
    U2 = TT.matrix()
    B = TT.matrix()
    # Squared-error fit of the mapped similarity matrix to the alignment A,
    # plus L2 regularization on B.
    cost = TT.sum(((shared(x[uniq_src]).dot(U1.dot(B.dot(U2.T)))).dot(shared(y[uniq_trg]).T) - A)**2) + 0.5 * Lambda * (TT.sum(B**2))
    solver = ConjugateGradient(maxtime=args.max_opt_time, maxiter=args.max_opt_iter)
    low_rank = 300
    manifold = Product([Stiefel(x.shape[1], low_rank), Stiefel(y.shape[1], low_rank), PositiveDefinite(low_rank)])
    problem = Problem(manifold=manifold, cost=cost, arg=[U1, U2, B], verbosity=3)
    wopt = solver.solve(problem)
    w = wopt
    U1 = w[0]
    U2 = w[1]
    B = w[2]
    # Full source->pivot map and its factors, kept for CMP/PIP below.
    w12 = U1.dot(B).dot(U2.T)
    u11 = U1
    u21 = U2
    b1 = B

    # Step 1.2: Optimization-2 (learn pivot -> target GeoMM map; mirrors
    # Optimization-1 with dictionary-2)
    if args.verbose:
        print('Beginning Optimization-2')
    y_count = len(set(src_indices23))
    z_count = len(set(trg_indices23))
    A = np.zeros((y_count, z_count))

    # Creating dictionary matrix from training set
    map_dict_src = {}
    map_dict_trg = {}
    I = 0
    uniq_src = []
    uniq_trg = []
    for i in range(len(src_indices23)):
        if src_indices23[i] not in map_dict_src.keys():
            map_dict_src[src_indices23[i]] = I
            I += 1
            uniq_src.append(src_indices23[i])
    J = 0
    for j in range(len(trg_indices23)):
        if trg_indices23[j] not in map_dict_trg.keys():
            map_dict_trg[trg_indices23[j]] = J
            J += 1
            uniq_trg.append(trg_indices23[j])
    for i in range(len(src_indices23)):
        A[map_dict_src[src_indices23[i]], map_dict_trg[trg_indices23[i]]] = 1

    np.random.seed(0)
    U1 = TT.matrix()
    U2 = TT.matrix()
    B = TT.matrix()
    # Lambda carries over from Optimization-1 (same l2_reg).
    cost = TT.sum(((shared(y[uniq_src]).dot(U1.dot(B.dot(U2.T)))).dot(shared(z[uniq_trg]).T) - A)**2) + 0.5 * Lambda * (TT.sum(B**2))
    solver = ConjugateGradient(maxtime=args.max_opt_time, maxiter=args.max_opt_iter)
    low_rank = 300
    manifold = Product([Stiefel(y.shape[1], low_rank), Stiefel(z.shape[1], low_rank), PositiveDefinite(low_rank)])
    problem = Problem(manifold=manifold, cost=cost, arg=[U1, U2, B], verbosity=3)
    wopt = solver.solve(problem)
    w = wopt
    U1 = w[0]
    U2 = w[1]
    B = w[2]
    w23 = U1.dot(B).dot(U2.T)
    u22 = U1
    u32 = U2
    b2 = B

    # Step 2: Transformation — latent-space projections U.dot(B^(1/2)) for
    # each side of each pairwise map (used by PIP).
    # NOTE(review): scipy.linalg.sqrtm can return complex output for
    # ill-conditioned input; presumably b1/b2 stay PD — TODO confirm.
    w12_1 = u11.dot(scipy.linalg.sqrtm(b1))
    w12_2 = u21.dot(scipy.linalg.sqrtm(b1))
    w23_1 = u22.dot(scipy.linalg.sqrtm(b2))
    w23_2 = u32.dot(scipy.linalg.sqrtm(b2))
    end_time = time.time()
    if args.verbose:
        print('Completed training in {0:.2f} seconds'.format(end_time - start_time))
    gc.collect()

    # Step 3: Evaluation
    # Loading test dictionary (source -> target gold pairs)
    f = open(args.dictionary_test, encoding=args.encoding, errors='surrogateescape')
    src2trg = collections.defaultdict(set)
    trg2src = collections.defaultdict(set)
    oov = set()
    vocab = set()
    for line in f:
        src, trg = line.split()
        if args.max_vocab:
            src = src.lower()
            trg = trg.lower()
        try:
            src_ind = src_word2ind[src]
            trg_ind = trg_word2ind[trg]
            src2trg[src_ind].add(trg_ind)
            trg2src[trg_ind].add(src_ind)
            vocab.add(src)
        except KeyError:
            oov.add(src)
    src = list(src2trg.keys())
    # NOTE(review): trgt is never used afterwards.
    trgt = list(trg2src.keys())
    oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
    coverage = len(src2trg) / (len(src2trg) + len(oov))
    f.close()

    # Composition (CMP): map source straight into the target space by
    # composing the two learned maps, then CSLS retrieval.
    xw = x.dot(w12).dot(w23)
    zw = z
    if args.normalize_eval:
        xw = embeddings.length_normalize(xw)
        zw = embeddings.length_normalize(zw)
    translation = collections.defaultdict(int)
    translation5 = collections.defaultdict(list)
    translation10 = collections.defaultdict(list)
    t = time.time()
    # CSLS neighbourhood terms (mean similarity to k nearest neighbours).
    nbrhood_x = np.zeros(xw.shape[0])
    nbrhood_z = np.zeros(zw.shape[0])
    nbrhood_z2 = cp.zeros(zw.shape[0])
    for i in range(0, len(src), BATCH_SIZE):
        j = min(i + BATCH_SIZE, len(src))
        similarities = xw[src[i:j]].dot(zw.T)
        # np.partition on negated scores keeps the k largest similarities.
        similarities_x = -1 * np.partition(-1 * similarities, args.csls_neighbourhood - 1, axis=1)
        nbrhood_x[src[i:j]] = np.mean(similarities_x[:, :args.csls_neighbourhood], axis=1)
    batch_num = 1
    for i in range(0, zw.shape[0], BATCH_SIZE):
        j = min(i + BATCH_SIZE, zw.shape[0])
        similarities = -1 * cp.partition(-1 * cp.dot(cp.asarray(zw[i:j]), cp.transpose(cp.asarray(xw))), args.csls_neighbourhood - 1, axis=1)[:, :args.csls_neighbourhood]
        nbrhood_z2[i:j] = (cp.mean(similarities[:, :args.csls_neighbourhood], axis=1))
        batch_num += 1
    # Bring the GPU-computed target neighbourhood back to host memory.
    nbrhood_z = cp.asnumpy(nbrhood_z2)
    for i in range(0, len(src), BATCH_SIZE):
        j = min(i + BATCH_SIZE, len(src))
        similarities = xw[src[i:j]].dot(zw.T)
        # CSLS score: 2*cos - r(x) - r(z).
        similarities = np.transpose(np.transpose(2 * similarities) - nbrhood_x[src[i:j]]) - nbrhood_z
        nn = similarities.argmax(axis=1).tolist()
        similarities = np.argsort((similarities), axis=1)
        nn5 = (similarities[:, -5:])
        nn10 = (similarities[:, -10:])
        for k in range(j - i):
            translation[src[i + k]] = nn[k]
            translation5[src[i + k]] = nn5[k]
            translation10[src[i + k]] = nn10[k]
    accuracy = np.mean([1 if translation[i] in src2trg[i] else 0 for i in src])
    mean = 0
    for i in src:
        for k in translation5[i]:
            if k in src2trg[i]:
                mean += 1
                break
    mean /= len(src)
    accuracy5 = mean
    mean = 0
    for i in src:
        for k in translation10[i]:
            if k in src2trg[i]:
                mean += 1
                break
    mean /= len(src)
    accuracy10 = mean
    print('CMP: Coverage:{0:7.2%} Accuracy:{1:7.2%} Accuracy(Top 5):{2:7.2%} Accuracy(Top 10):{3:7.2%}'.format(coverage, accuracy, accuracy5, accuracy10))

    # Pipeline (PIP): stage 1 translates source -> pivot in the src/pivot
    # latent space; stage 2 translates the chosen pivot word -> target.
    xw = x.dot(w12_1)
    zw = y.dot(w12_2)
    if args.normalize_eval:
        xw = embeddings.length_normalize(xw)
        zw = embeddings.length_normalize(zw)
    # NOTE(review): translation12 is created but never used; stage 1 writes
    # into `translation` (overwriting the CMP results, which have already
    # been scored).
    translation12 = collections.defaultdict(int)

    # PIP-Stage 1
    t = time.time()
    nbrhood_x = np.zeros(xw.shape[0])
    nbrhood_z = np.zeros(zw.shape[0])
    nbrhood_z2 = cp.zeros(zw.shape[0])
    for i in range(0, len(src), BATCH_SIZE):
        j = min(i + BATCH_SIZE, len(src))
        similarities = xw[src[i:j]].dot(zw.T)
        similarities_x = -1 * np.partition(-1 * similarities, args.csls_neighbourhood - 1, axis=1)
        nbrhood_x[src[i:j]] = np.mean(similarities_x[:, :args.csls_neighbourhood], axis=1)
    batch_num = 1
    for i in range(0, zw.shape[0], BATCH_SIZE):
        j = min(i + BATCH_SIZE, zw.shape[0])
        similarities = -1 * cp.partition(-1 * cp.dot(cp.asarray(zw[i:j]), cp.transpose(cp.asarray(xw))), args.csls_neighbourhood - 1, axis=1)[:, :args.csls_neighbourhood]
        nbrhood_z2[i:j] = (cp.mean(similarities[:, :args.csls_neighbourhood], axis=1))
        batch_num += 1
    nbrhood_z = cp.asnumpy(nbrhood_z2)
    for i in range(0, len(src), BATCH_SIZE):
        j = min(i + BATCH_SIZE, len(src))
        similarities = xw[src[i:j]].dot(zw.T)
        similarities = np.transpose(np.transpose(2 * similarities) - nbrhood_x[src[i:j]]) - nbrhood_z
        nn = similarities.argmax(axis=1).tolist()
        for k in range(j - i):
            translation[src[i + k]] = nn[k]

    # PIP-Stage 2: mid[k] is the pivot-word index chosen for src[k].
    mid = [translation[sr] for sr in src]
    xw = y.dot(w23_1)
    zw = z.dot(w23_2)
    if args.normalize_eval:
        xw = embeddings.length_normalize(xw)
        zw = embeddings.length_normalize(zw)
    translation = collections.defaultdict(int)
    translation5 = collections.defaultdict(list)
    translation10 = collections.defaultdict(list)
    t = time.time()
    nbrhood_x = np.zeros(xw.shape[0])
    nbrhood_z = np.zeros(zw.shape[0])
    nbrhood_z2 = cp.zeros(zw.shape[0])
    for i in range(0, len(mid), BATCH_SIZE):
        j = min(i + BATCH_SIZE, len(mid))
        similarities = xw[mid[i:j]].dot(zw.T)
        # similarities_x = np.sort(similarities, axis=1)
        similarities_x = -1 * np.partition(-1 * similarities, args.csls_neighbourhood - 1, axis=1)
        # NOTE(review): nbrhood_x is indexed by pivot-word id here; if two
        # source words map to the same pivot word the slot is simply
        # rewritten with the same value.
        nbrhood_x[mid[i:j]] = np.mean(similarities_x[:, :args.csls_neighbourhood], axis=1)
    batch_num = 1
    for i in range(0, zw.shape[0], BATCH_SIZE):
        j = min(i + BATCH_SIZE, zw.shape[0])
        similarities = -1 * cp.partition(-1 * cp.dot(cp.asarray(zw[i:j]), cp.transpose(cp.asarray(xw))), args.csls_neighbourhood - 1, axis=1)[:, :args.csls_neighbourhood]
        nbrhood_z2[i:j] = (cp.mean(similarities[:, :args.csls_neighbourhood], axis=1))
        batch_num += 1
    nbrhood_z = cp.asnumpy(nbrhood_z2)
    for i in range(0, len(mid), BATCH_SIZE):
        j = min(i + BATCH_SIZE, len(mid))
        similarities = xw[mid[i:j]].dot(zw.T)
        similarities = np.transpose(np.transpose(2 * similarities) - nbrhood_x[mid[i:j]]) - nbrhood_z
        nn = similarities.argmax(axis=1).tolist()
        similarities = np.argsort((similarities), axis=1)
        nn5 = (similarities[:, -5:])
        nn10 = (similarities[:, -10:])
        for k in range(j - i):
            # Results are keyed back by the ORIGINAL source id (src[i+k]),
            # since mid[i+k] is the pivot chosen for src[i+k] — the two lists
            # are index-aligned.
            translation[src[i + k]] = nn[k]
            translation5[src[i + k]] = nn5[k]
            translation10[src[i + k]] = nn10[k]
    accuracy = np.mean([1 if translation[i] in src2trg[i] else 0 for i in src])
    mean = 0
    for i in src:
        for k in translation5[i]:
            if k in src2trg[i]:
                mean += 1
                break
    mean /= len(src)
    accuracy5 = mean
    mean = 0
    for i in src:
        for k in translation10[i]:
            if k in src2trg[i]:
                mean += 1
                break
    mean /= len(src)
    accuracy10 = mean
    print('PIP: Coverage:{0:7.2%} Accuracy:{1:7.2%} Accuracy(Top 5):{2:7.2%} Accuracy(Top 10):{3:7.2%}'.format(coverage, accuracy, accuracy5, accuracy10))
def main():
    """Command-line entry point: map source embeddings into the target space.

    Implements the embedding-mapping pipeline: optional normalization
    (STEP 0), then one of three mappings — orthogonal (EMNLP 2016),
    unconstrained least-squares, or the advanced whitening / re-weighting /
    de-whitening / dimensionality-reduction pipeline (AAAI 2018) — with an
    optional self-learning loop (ACL 2017) that re-induces the training
    dictionary each iteration until the objective improvement falls below
    the convergence threshold. Reads the embedding and dictionary files
    named on the command line and writes the mapped embeddings to the given
    output files. Exits with status -1 on invalid argument combinations or
    missing CUDA support.

    FIX over the original: every transient file handle (input embeddings,
    training dictionary, validation dictionary, outputs, log) is now closed
    via a context manager or an explicit close; the original leaked all of
    them except the two output files.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Map the source embeddings into the target embedding space'
    )
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('src_output', help='the output source embeddings')
    parser.add_argument('trg_output', help='the output target embeddings')
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--precision',
                        choices=['fp16', 'fp32', 'fp64'],
                        default='fp64',
                        help='the floating-point precision (defaults to fp64)')
    parser.add_argument('--cuda',
                        action='store_true',
                        help='use cuda (requires cupy)')
    mapping_group = parser.add_argument_group(
        'mapping arguments', 'Basic embedding mapping arguments (EMNLP 2016)')
    mapping_group.add_argument(
        '-d',
        '--dictionary',
        default=sys.stdin.fileno(),
        help='the training dictionary file (defaults to stdin)')
    mapping_group.add_argument(
        '--normalize',
        choices=['unit', 'center', 'unitdim', 'centeremb'],
        nargs='*',
        default=[],
        help='the normalization actions to perform in order')
    mapping_type = mapping_group.add_mutually_exclusive_group()
    mapping_type.add_argument('-c',
                              '--orthogonal',
                              action='store_true',
                              help='use orthogonal constrained mapping')
    mapping_type.add_argument('-u',
                              '--unconstrained',
                              action='store_true',
                              help='use unconstrained mapping')
    self_learning_group = parser.add_argument_group(
        'self-learning arguments',
        'Optional arguments for self-learning (ACL 2017)')
    self_learning_group.add_argument('--self_learning',
                                     action='store_true',
                                     help='enable self-learning')
    self_learning_group.add_argument(
        '--direction',
        choices=['forward', 'backward', 'union'],
        default='forward',
        help='the direction for dictionary induction (defaults to forward)')
    self_learning_group.add_argument(
        '--numerals',
        action='store_true',
        help=
        'use latin numerals (i.e. words matching [0-9]+) as the seed dictionary'
    )
    self_learning_group.add_argument(
        '--threshold',
        default=0.000001,
        type=float,
        help='the convergence threshold (defaults to 0.000001)')
    self_learning_group.add_argument(
        '--validation',
        default=None,
        help='a dictionary file for validation at each iteration')
    self_learning_group.add_argument(
        '--log', help='write to a log file in tsv format at each iteration')
    self_learning_group.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        help='write log information to stderr at each iteration')
    advanced_group = parser.add_argument_group(
        'advanced mapping arguments',
        'Advanced embedding mapping arguments (AAAI 2018)')
    advanced_group.add_argument('--whiten',
                                action='store_true',
                                help='whiten the embeddings')
    advanced_group.add_argument(
        '--src_reweight',
        type=float,
        default=0,
        nargs='?',
        const=1,
        help='re-weight the source language embeddings')
    advanced_group.add_argument(
        '--trg_reweight',
        type=float,
        default=0,
        nargs='?',
        const=1,
        help='re-weight the target language embeddings')
    advanced_group.add_argument(
        '--src_dewhiten',
        choices=['src', 'trg'],
        help='de-whiten the source language embeddings')
    advanced_group.add_argument(
        '--trg_dewhiten',
        choices=['src', 'trg'],
        help='de-whiten the target language embeddings')
    advanced_group.add_argument('--dim_reduction',
                                type=int,
                                default=0,
                                help='apply dimensionality reduction')
    args = parser.parse_args()

    # Check command line arguments: de-whitening re-uses the whitening
    # matrices (wx1/wz1), so it cannot work without --whiten.
    if (args.src_dewhiten is not None
            or args.trg_dewhiten is not None) and not args.whiten:
        print('ERROR: De-whitening requires whitening first', file=sys.stderr)
        sys.exit(-1)

    # Choose the right dtype for the desired precision
    if args.precision == 'fp16':
        dtype = 'float16'
    elif args.precision == 'fp32':
        dtype = 'float32'
    elif args.precision == 'fp64':
        dtype = 'float64'

    # Read input embeddings (context managers close the files once read;
    # the original leaked both handles for the rest of the run)
    with open(args.src_input, encoding=args.encoding,
              errors='surrogateescape') as srcfile:
        src_words, x = embeddings.read(srcfile, dtype=dtype)
    with open(args.trg_input, encoding=args.encoding,
              errors='surrogateescape') as trgfile:
        trg_words, z = embeddings.read(trgfile, dtype=dtype)

    # NumPy/CuPy management
    if args.cuda:
        if not supports_cupy():
            print('ERROR: Install CuPy for CUDA support', file=sys.stderr)
            sys.exit(-1)
        xp = get_cupy()
        x = xp.asarray(x)
        z = xp.asarray(z)
    else:
        xp = np

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Build training dictionary
    src_indices = []
    trg_indices = []
    if args.numerals:
        if args.dictionary != sys.stdin.fileno():
            print('WARNING: Using numerals instead of the training dictionary',
                  file=sys.stderr)
        numeral_regex = re.compile('^[0-9]+$')
        src_numerals = {
            word
            for word in src_words if numeral_regex.match(word) is not None
        }
        trg_numerals = {
            word
            for word in trg_words if numeral_regex.match(word) is not None
        }
        numerals = src_numerals.intersection(trg_numerals)
        for word in numerals:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
    else:
        # One "src trg" pair per line; OOV entries are skipped with a warning
        with open(args.dictionary, encoding=args.encoding,
                  errors='surrogateescape') as f:
            for line in f:
                src, trg = line.split()
                try:
                    src_ind = src_word2ind[src]
                    trg_ind = trg_word2ind[trg]
                    src_indices.append(src_ind)
                    trg_indices.append(trg_ind)
                except KeyError:
                    print('WARNING: OOV dictionary entry ({0} - {1})'.format(
                        src, trg),
                          file=sys.stderr)

    # Read validation dictionary (maps src index -> set of gold trg indices)
    if args.validation is not None:
        validation = collections.defaultdict(set)
        oov = set()
        vocab = set()
        with open(args.validation, encoding=args.encoding,
                  errors='surrogateescape') as f:
            for line in f:
                src, trg = line.split()
                try:
                    src_ind = src_word2ind[src]
                    trg_ind = trg_word2ind[trg]
                    validation[src_ind].add(trg_ind)
                    vocab.add(src)
                except KeyError:
                    oov.add(src)
        # If one of the translation options is in the vocabulary, then the
        # entry is not an oov
        oov -= vocab
        validation_coverage = len(validation) / (len(validation) + len(oov))

    # Create log file (closed explicitly at the end of the run)
    log = None
    if args.log:
        log = open(args.log,
                   mode='w',
                   encoding=args.encoding,
                   errors='surrogateescape')

    # STEP 0: Normalization
    for action in args.normalize:
        if action == 'unit':
            x = embeddings.length_normalize(x)
            z = embeddings.length_normalize(z)
        elif action == 'center':
            x = embeddings.mean_center(x)
            z = embeddings.mean_center(z)
        elif action == 'unitdim':
            x = embeddings.length_normalize_dimensionwise(x)
            z = embeddings.length_normalize_dimensionwise(z)
        elif action == 'centeremb':
            x = embeddings.mean_center_embeddingwise(x)
            z = embeddings.mean_center_embeddingwise(z)

    # Training loop: runs once unless --self_learning keeps improving the
    # objective by at least the convergence threshold
    prev_objective = objective = -100.
    it = 1
    t = time.time()
    while it == 1 or objective - prev_objective >= args.threshold:

        # Update the embedding mapping
        if args.orthogonal:  # orthogonal mapping (Procrustes solution)
            u, s, vt = xp.linalg.svd(z[trg_indices].T.dot(x[src_indices]))
            w = vt.T.dot(u.T)
            xw = x.dot(w)
            zw = z
        elif args.unconstrained:  # unconstrained least-squares mapping
            x_pseudoinv = xp.linalg.inv(x[src_indices].T.dot(
                x[src_indices])).dot(x[src_indices].T)
            w = x_pseudoinv.dot(z[trg_indices])
            xw = x.dot(w)
            zw = z
        else:  # advanced mapping (AAAI 2018 pipeline)
            xw = x
            zw = z

            # STEP 1: Whitening
            def whitening_transformation(m):
                # ZCA-style whitening from the SVD of the dictionary rows
                u, s, vt = xp.linalg.svd(m, full_matrices=False)
                return vt.T.dot(xp.diag(1 / s)).dot(vt)

            if args.whiten:
                wx1 = whitening_transformation(xw[src_indices])
                wz1 = whitening_transformation(zw[trg_indices])
                xw = xw.dot(wx1)
                zw = zw.dot(wz1)

            # STEP 2: Orthogonal mapping
            wx2, s, wz2_t = xp.linalg.svd(xw[src_indices].T.dot(
                zw[trg_indices]))
            wz2 = wz2_t.T
            xw = xw.dot(wx2)
            zw = zw.dot(wz2)

            # STEP 3: Re-weighting by the cross-correlation singular values
            xw *= s**args.src_reweight
            zw *= s**args.trg_reweight

            # STEP 4: De-whitening
            if args.src_dewhiten == 'src':
                xw = xw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2))
            elif args.src_dewhiten == 'trg':
                xw = xw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2))
            if args.trg_dewhiten == 'src':
                zw = zw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2))
            elif args.trg_dewhiten == 'trg':
                zw = zw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2))

            # STEP 5: Dimensionality reduction (keep the top components)
            if args.dim_reduction > 0:
                xw = xw[:, :args.dim_reduction]
                zw = zw[:, :args.dim_reduction]

        # Self-learning
        if args.self_learning:

            # Update the training dictionary via blockwise nearest neighbours
            # (blocked by MAX_DIM_X x MAX_DIM_Z to bound memory usage)
            best_sim_forward = xp.full(x.shape[0], -100, dtype=dtype)
            src_indices_forward = xp.arange(x.shape[0])
            trg_indices_forward = xp.zeros(x.shape[0], dtype=int)
            best_sim_backward = xp.full(z.shape[0], -100, dtype=dtype)
            src_indices_backward = xp.zeros(z.shape[0], dtype=int)
            trg_indices_backward = xp.arange(z.shape[0])
            for i in range(0, x.shape[0], MAX_DIM_X):
                j = min(x.shape[0], i + MAX_DIM_X)
                for k in range(0, z.shape[0], MAX_DIM_Z):
                    l = min(z.shape[0], k + MAX_DIM_Z)
                    sim = xw[i:j].dot(zw[k:l].T)
                    if args.direction in ('forward', 'union'):
                        ind = sim.argmax(axis=1)
                        val = sim[xp.arange(sim.shape[0]), ind]
                        ind += k
                        mask = (val > best_sim_forward[i:j])
                        best_sim_forward[i:j][mask] = val[mask]
                        trg_indices_forward[i:j][mask] = ind[mask]
                    if args.direction in ('backward', 'union'):
                        ind = sim.argmax(axis=0)
                        val = sim[ind, xp.arange(sim.shape[1])]
                        ind += i
                        mask = (val > best_sim_backward[k:l])
                        best_sim_backward[k:l][mask] = val[mask]
                        src_indices_backward[k:l][mask] = ind[mask]
            if args.direction == 'forward':
                src_indices = src_indices_forward
                trg_indices = trg_indices_forward
            elif args.direction == 'backward':
                src_indices = src_indices_backward
                trg_indices = trg_indices_backward
            elif args.direction == 'union':
                src_indices = xp.concatenate(
                    (src_indices_forward, src_indices_backward))
                trg_indices = xp.concatenate(
                    (trg_indices_forward, trg_indices_backward))

            # Objective function evaluation (mean best similarity)
            prev_objective = objective
            if args.direction == 'forward':
                objective = xp.mean(best_sim_forward).tolist()
            elif args.direction == 'backward':
                objective = xp.mean(best_sim_backward).tolist()
            elif args.direction == 'union':
                objective = (xp.mean(best_sim_forward) +
                             xp.mean(best_sim_backward)).tolist() / 2

            # Accuracy and similarity evaluation in validation
            if args.validation is not None:
                src = list(validation.keys())
                sim = xw[src].dot(zw.T)  # TODO Assuming that it fits in memory
                nn = asnumpy(sim.argmax(axis=1))
                accuracy = np.mean([
                    1 if nn[i] in validation[src[i]] else 0
                    for i in range(len(src))
                ])
                similarity = np.mean([
                    max([sim[i, j].tolist() for j in validation[src[i]]])
                    for i in range(len(src))
                ])

            # Logging
            duration = time.time() - t
            if args.verbose:
                print(file=sys.stderr)
                print('ITERATION {0} ({1:.2f}s)'.format(it, duration),
                      file=sys.stderr)
                print('\t- Objective: {0:9.4f}%'.format(100 * objective),
                      file=sys.stderr)
                if args.validation is not None:
                    print('\t- Val. similarity: {0:9.4f}%'.format(
                        100 * similarity),
                          file=sys.stderr)
                    print('\t- Val. accuracy: {0:9.4f}%'.format(
                        100 * accuracy),
                          file=sys.stderr)
                    print('\t- Val. coverage: {0:9.4f}%'.format(
                        100 * validation_coverage),
                          file=sys.stderr)
                sys.stderr.flush()
            if args.log is not None:
                val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format(
                    100 * similarity, 100 * accuracy, 100 *
                    validation_coverage) if args.validation is not None else ''
                print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(
                    it, 100 * objective, val, duration),
                      file=log)
                log.flush()

        t = time.time()
        it += 1

    # Write mapped embeddings (context managers guarantee flush/close even if
    # embeddings.write raises)
    with open(args.src_output,
              mode='w',
              encoding=args.encoding,
              errors='surrogateescape') as srcfile:
        embeddings.write(src_words, xw, srcfile)
    with open(args.trg_output,
              mode='w',
              encoding=args.encoding,
              errors='surrogateescape') as trgfile:
        embeddings.write(trg_words, zw, trgfile)
    if log is not None:
        log.close()