def make_dataset(self, root):
    # `root` holds two space-separated embedding paths: one per view
    root = root.split(' ')
    view1 = open(root[0], encoding='utf-8', errors='surrogateescape')
    view2 = open(root[1], encoding='utf-8', errors='surrogateescape')
    src_words, view1_vec = embeddings.read(view1)
    trg_words, view2_vec = embeddings.read(view2)
    view1_vec = embeddings.length_normalize(view1_vec)
    view2_vec = embeddings.length_normalize(view2_vec)
    view1.close()
    view2.close()
    return torch.from_numpy(np.column_stack((view1_vec, view2_vec)))
def normalize_emb(emb, method):
    """Normalize the input embedding matrix based on the chosen method."""
    print(f"Normalizing using {method}")
    if method == 'unit':
        emb = embeddings.length_normalize(emb)
    elif method == 'center':
        emb = embeddings.mean_center(emb)
    elif method == 'unitdim':
        emb = embeddings.length_normalize_dimensionwise(emb)
    elif method == 'centeremb':
        emb = embeddings.mean_center_embeddingwise(emb)
    return emb
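# A minimal usage sketch for normalize_emb (hypothetical file name; assumes the
# same `embeddings` helper module used throughout this code):
#
#   with open('emb.en.vec', encoding='utf-8', errors='surrogateescape') as f:
#       words, matrix = embeddings.read(f)
#   matrix = normalize_emb(matrix, 'unit')    # length-normalize every row
#   matrix = normalize_emb(matrix, 'center')  # then mean-center each dimension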
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Normalize word embeddings')
    parser.add_argument('actions',
                        choices=['none', 'unit', 'center', 'unitdim', 'centeremb'],
                        nargs='+',
                        help='the actions to perform in order')
    parser.add_argument('-i', '--input', default=sys.stdin.fileno(),
                        help='the input word embedding file (defaults to stdin)')
    parser.add_argument('-o', '--output', default=sys.stdout.fileno(),
                        help='the output word embedding file (defaults to stdout)')
    parser.add_argument('--encoding', default='utf-8',
                        help='the character encoding for input/output (defaults to utf-8)')
    args = parser.parse_args()

    # Read input embeddings
    f = open(args.input, encoding=args.encoding, errors='surrogateescape')
    words, matrix = embeddings.read(f)

    # Perform normalization actions
    for action in args.actions:
        if action == 'unit':
            matrix = embeddings.length_normalize(matrix)
        elif action == 'center':
            matrix = embeddings.mean_center(matrix)
        elif action == 'unitdim':
            matrix = embeddings.length_normalize_dimensionwise(matrix)
        elif action == 'centeremb':
            matrix = embeddings.mean_center_embeddingwise(matrix)

    # Write normalized embeddings
    f = open(args.output, mode='w', encoding=args.encoding, errors='surrogateescape')
    embeddings.write(words, matrix, f)
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Generate latent space embeddings')
    parser.add_argument('emb1', help='path to embedding 1')
    parser.add_argument('emb2', help='path to embedding 2')
    parser.add_argument('--geomm_embeddings_path', default=None, type=str,
                        help='directory to save the output GeoMM latent space embeddings. '
                             'The output embeddings are normalized.')
    parser.add_argument('--encoding', default='utf-8',
                        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--verbose', default=0, type=int, help='Verbose')
    mapping_group = parser.add_argument_group('mapping arguments',
                                              'Basic embedding mapping arguments')
    mapping_group.add_argument('--dictionary', default=sys.stdin.fileno(),
                               help='the dictionary file (defaults to stdin)')
    mapping_group.add_argument('--normalize',
                               choices=['unit', 'center', 'unitdim', 'centeremb', 'no'],
                               nargs=2, default=[],
                               help='the normalization actions performed in sequence for embeddings 1 and 2')
    geomm_group = parser.add_argument_group('GeoMM arguments', 'Arguments for GeoMM method')
    geomm_group.add_argument('--l2_reg', type=float, default=1e2,
                             help='Lambda for L2 Regularization')
    geomm_group.add_argument('--max_opt_time', type=int, default=5000,
                             help='Maximum time limit for optimization in seconds')
    geomm_group.add_argument('--max_opt_iter', type=int, default=150,
                             help='Maximum number of iterations for optimization')
    args = parser.parse_args()

    if args.verbose:
        print('Current arguments: {0}'.format(args))

    dtype = 'float32'
    if args.verbose:
        print('Loading embeddings data...')

    # Read input embeddings
    emb1file = open(args.emb1, encoding=args.encoding, errors='surrogateescape')
    emb2file = open(args.emb2, encoding=args.encoding, errors='surrogateescape')
    emb1_words, x = embeddings.read(emb1file, max_voc=0, dtype=dtype)
    emb2_words, z = embeddings.read(emb2file, max_voc=0, dtype=dtype)

    # Build word to index map
    emb1_word2ind = {word: i for i, word in enumerate(emb1_words)}
    emb2_word2ind = {word: i for i, word in enumerate(emb2_words)}

    # Read training dictionary
    noov = 0
    emb1_indices = []
    emb2_indices = []
    f = open(args.dictionary, encoding=args.encoding, errors='surrogateescape')
    for line in f:
        emb1, emb2 = line.split()
        try:
            emb1_ind = emb1_word2ind[emb1]
            emb2_ind = emb2_word2ind[emb2]
            emb1_indices.append(emb1_ind)
            emb2_indices.append(emb2_ind)
        except KeyError:
            noov += 1
            if args.verbose:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(emb1, emb2))
    f.close()
    if args.verbose:
        print('Number of embedding pairs having at least one OOV: {}'.format(noov))

    if args.verbose:
        print('Normalizing embeddings...')

    # STEP 0: Normalization
    if len(args.normalize) > 0:
        x = normalize_emb(x, args.normalize[0])
        z = normalize_emb(z, args.normalize[1])

    # Step 1: Optimization
    if args.verbose:
        print('Beginning Optimization')
    start_time = time.time()
    x_count = len(set(emb1_indices))
    z_count = len(set(emb2_indices))

    # Keep only the unique dictionary indices (first occurrence order)
    map_dict_emb1 = {}
    map_dict_emb2 = {}
    I = 0
    uniq_emb1 = []
    uniq_emb2 = []
    for i in range(len(emb1_indices)):
        if emb1_indices[i] not in map_dict_emb1:
            map_dict_emb1[emb1_indices[i]] = I
            I += 1
            uniq_emb1.append(emb1_indices[i])
    J = 0
    for j in range(len(emb2_indices)):
        if emb2_indices[j] not in map_dict_emb2:
            map_dict_emb2[emb2_indices[j]] = J
            J += 1
            uniq_emb2.append(emb2_indices[j])

    # Creating dictionary matrix
    row = list(range(0, x_count))
    col = list(range(0, x_count))
    data = [1 for i in range(0, x_count)]
    print("Counts: {}, {}".format(x_count, z_count))
    A = coo_matrix((data, (row, col)), shape=(x_count, z_count))

    np.random.seed(0)
    Lambda = args.l2_reg
    U1 = TT.matrix()
    U2 = TT.matrix()
    B = TT.matrix()
    Xemb1 = x[uniq_emb1]
    Zemb2 = z[uniq_emb2]
    del x, z
    gc.collect()
    Kx, Kz = Xemb1, Zemb2
    XtAZ = Kx.T.dot(A.dot(Kz))
    XtX = Kx.T.dot(Kx)
    ZtZ = Kz.T.dot(Kz)
    AA = np.sum(A * A)

    W = (U1.dot(B)).dot(U2.T)
    regularizer = 0.5 * Lambda * (TT.sum(B**2))
    sXtX = shared(XtX)
    sZtZ = shared(ZtZ)
    sXtAZ = shared(XtAZ)
    cost = regularizer
    wtxtxw = W.T.dot(sXtX.dot(W))
    wtxtxwztz = wtxtxw.dot(sZtZ)
    cost += TT.nlinalg.trace(wtxtxwztz)
    cost += -2 * TT.sum(W * sXtAZ)
    cost += shared(AA)

    solver = ConjugateGradient(maxtime=args.max_opt_time, maxiter=args.max_opt_iter)
    manifold = Product([
        Stiefel(Kx.shape[1], Kx.shape[1]),
        Stiefel(Kz.shape[1], Kz.shape[1]),
        PositiveDefinite(Kx.shape[1])
    ])
    problem = Problem(manifold=manifold, cost=cost, arg=[U1, U2, B], verbosity=3)
    wopt = solver.solve(problem)
    print("Problem solved ...")

    w = wopt
    U1 = w[0]
    U2 = w[1]
    B = w[2]
    print("Model copied ...")
    gc.collect()

    # Step 2: Transformation
    xw = Kx.dot(U1).dot(scipy.linalg.sqrtm(B))
    zw = Kz.dot(U2).dot(scipy.linalg.sqrtm(B))
    print("Transformation done ...")

    end_time = time.time()
    if args.verbose:
        print('Completed training in {0:.2f} seconds'.format(end_time - start_time))
    del Kx, Kz, B, U1, U2
    gc.collect()

    # Save the GeoMM embeddings if requested
    xw_n = embeddings.length_normalize(xw)
    zw_n = embeddings.length_normalize(zw)
    del xw, zw
    gc.collect()
    if args.geomm_embeddings_path is not None:
        os.makedirs(args.geomm_embeddings_path, exist_ok=True)

        out_emb_fname = os.path.join(args.geomm_embeddings_path, 'emb1.vec')
        new_emb1_words = []
        for id in uniq_emb1:
            new_emb1_words.append(emb1_words[id])
        with open(out_emb_fname, 'w', encoding=args.encoding) as outfile:
            embeddings.write(new_emb1_words, xw_n, outfile)

        new_emb2_words = []
        for id in uniq_emb2:
            new_emb2_words.append(emb2_words[id])
        out_emb_fname = os.path.join(args.geomm_embeddings_path, 'emb2.vec')
        with open(out_emb_fname, 'w', encoding=args.encoding) as outfile:
            embeddings.write(new_emb2_words, zw_n, outfile)

    exit(0)
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Map the source embeddings into the target embedding space')
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('src_output', help='the output source embeddings')
    parser.add_argument('trg_output', help='the output target embeddings')
    parser.add_argument('--encoding', default='utf-8',
                        help='the character encoding for input/output (defaults to utf-8)')
    mapping_group = parser.add_argument_group(
        'mapping arguments', 'Basic embedding mapping arguments (EMNLP 2016)')
    mapping_group.add_argument('-d', '--dictionary', default=sys.stdin.fileno(),
                               help='the training dictionary file (defaults to stdin)')
    mapping_group.add_argument('--normalize',
                               choices=['unit', 'center', 'unitdim', 'centeremb'],
                               nargs='*', default=[],
                               help='the normalization actions to perform in order')
    mapping_group.add_argument('-c', '--orthogonal', dest='orthogonal', action='store_true',
                               help='use orthogonal constrained mapping (default)')
    mapping_group.add_argument('-u', '--unconstrained', dest='orthogonal', action='store_false',
                               help='use unconstrained mapping')
    parser.set_defaults(orthogonal=True)
    self_learning_group = parser.add_argument_group(
        'self-learning arguments', 'Optional arguments for self-learning (ACL 2017)')
    self_learning_group.add_argument('--self_learning', action='store_true',
                                     help='enable self-learning')
    self_learning_group.add_argument('--direction',
                                     choices=['forward', 'backward', 'union'],
                                     default='forward',
                                     help='the direction for dictionary induction (defaults to forward)')
    self_learning_group.add_argument('--numerals', action='store_true',
                                     help='use latin numerals (i.e. words matching [0-9]+) as the seed dictionary')
    self_learning_group.add_argument('--threshold', default=0.000001, type=float,
                                     help='the convergence threshold (defaults to 0.000001)')
    self_learning_group.add_argument('--validation', default=None,
                                     help='a dictionary file for validation at each iteration')
    self_learning_group.add_argument('--log',
                                     help='write to a log file in tsv format at each iteration')
    self_learning_group.add_argument('-v', '--verbose', action='store_true',
                                     help='write log information to stderr at each iteration')
    args = parser.parse_args()

    # Read input embeddings
    srcfile = open(args.src_input, encoding=args.encoding, errors='surrogateescape')
    trgfile = open(args.trg_input, encoding=args.encoding, errors='surrogateescape')
    src_words, x = embeddings.read(srcfile)
    trg_words, z = embeddings.read(trgfile)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Build training dictionary
    src_indices = []
    trg_indices = []
    if args.numerals:
        if args.dictionary != sys.stdin.fileno():
            print('WARNING: Using numerals instead of the training dictionary',
                  file=sys.stderr)
        numeral_regex = re.compile('^[0-9]+$')
        src_numerals = {word for word in src_words if numeral_regex.match(word) is not None}
        trg_numerals = {word for word in trg_words if numeral_regex.match(word) is not None}
        numerals = src_numerals.intersection(trg_numerals)
        for word in numerals:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
    else:
        f = open(args.dictionary, encoding=args.encoding, errors='surrogateescape')
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                src_indices.append(src_ind)
                trg_indices.append(trg_ind)
            except KeyError:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg),
                      file=sys.stderr)

    # Read validation dictionary
    if args.validation is not None:
        f = open(args.validation, encoding=args.encoding, errors='surrogateescape')
        validation = collections.defaultdict(set)
        oov = set()
        vocab = set()
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                validation[src_ind].add(trg_ind)
                vocab.add(src)
            except KeyError:
                oov.add(src)
        # If one of the translation options is in the vocabulary, then the entry is not an OOV
        oov -= vocab
        validation_coverage = len(validation) / (len(validation) + len(oov))

    # Create log file
    if args.log:
        log = open(args.log, mode='w', encoding=args.encoding, errors='surrogateescape')

    # Normalize embeddings
    for action in args.normalize:
        if action == 'unit':
            x = embeddings.length_normalize(x)
            z = embeddings.length_normalize(z)
        elif action == 'center':
            x = embeddings.mean_center(x)
            z = embeddings.mean_center(z)
        elif action == 'unitdim':
            x = embeddings.length_normalize_dimensionwise(x)
            z = embeddings.length_normalize_dimensionwise(z)
        elif action == 'centeremb':
            x = embeddings.mean_center_embeddingwise(x)
            z = embeddings.mean_center_embeddingwise(z)

    # Training loop
    prev_objective = objective = -100.
    it = 1
    t = time.time()
    while it == 1 or objective - prev_objective >= args.threshold:

        # Update the embedding mapping
        if args.orthogonal:  # orthogonal mapping
            u, s, vt = np.linalg.svd(np.dot(z[trg_indices].T, x[src_indices]))
            w = np.dot(vt.T, u.T)
        else:  # unconstrained mapping
            x_pseudoinv = np.dot(
                np.linalg.inv(np.dot(x[src_indices].T, x[src_indices])),
                x[src_indices].T)
            w = np.dot(x_pseudoinv, z[trg_indices])
        xw = x.dot(w)

        # Self-learning
        if args.self_learning:

            # Update the training dictionary
            best_sim_forward = np.full(x.shape[0], -100.)
            src_indices_forward = range(x.shape[0])
            trg_indices_forward = np.zeros(x.shape[0], dtype=int)
            best_sim_backward = np.full(z.shape[0], -100.)
            src_indices_backward = np.zeros(z.shape[0], dtype=int)
            trg_indices_backward = range(z.shape[0])
            for i in range(0, x.shape[0], MAX_DIM_X):
                for j in range(0, z.shape[0], MAX_DIM_Z):
                    sim = xw[i:i + MAX_DIM_X].dot(z[j:j + MAX_DIM_Z].T)
                    for k in range(sim.shape[0]):
                        l = sim[k].argmax()
                        if sim[k, l] > best_sim_forward[i + k]:
                            best_sim_forward[i + k] = sim[k, l]
                            trg_indices_forward[i + k] = j + l
                    if args.direction in ('backward', 'union'):  # Slow, only do if necessary
                        for l in range(sim.shape[1]):
                            k = sim[:, l].argmax()
                            if sim[k, l] > best_sim_backward[j + l]:
                                best_sim_backward[j + l] = sim[k, l]
                                src_indices_backward[j + l] = i + k
                    sim = None
            if args.direction == 'forward':
                src_indices = src_indices_forward
                trg_indices = trg_indices_forward
            elif args.direction == 'backward':
                src_indices = src_indices_backward
                trg_indices = trg_indices_backward
            elif args.direction == 'union':
                src_indices = np.concatenate((src_indices_forward, src_indices_backward))
                trg_indices = np.concatenate((trg_indices_forward, trg_indices_backward))

            # Objective function evaluation
            prev_objective = objective
            if args.direction == 'forward':
                objective = np.mean(best_sim_forward)
            elif args.direction == 'backward':
                objective = np.mean(best_sim_backward)
            elif args.direction == 'union':
                objective = (np.mean(best_sim_forward) + np.mean(best_sim_backward)) / 2

            # Accuracy and similarity evaluation in validation
            if args.validation is not None:
                accuracy = np.mean([
                    1 if trg_indices_forward[src] in trg else 0
                    for src, trg in validation.items()
                ])
                similarity = np.mean([
                    np.max(z[list(trg)].dot(xw[src]))
                    for src, trg in validation.items()
                ])

            # Logging
            duration = time.time() - t
            if args.verbose:
                print(file=sys.stderr)
                print('ITERATION {0} ({1:.2f}s)'.format(it, duration), file=sys.stderr)
                print('\t- Objective: {0:9.4f}%'.format(100 * objective), file=sys.stderr)
                if args.validation is not None:
                    print('\t- Val. similarity: {0:9.4f}%'.format(100 * similarity),
                          file=sys.stderr)
                    print('\t- Val. accuracy: {0:9.4f}%'.format(100 * accuracy),
                          file=sys.stderr)
                    print('\t- Val. coverage: {0:9.4f}%'.format(100 * validation_coverage),
                          file=sys.stderr)
                sys.stderr.flush()
            if args.log is not None:
                val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format(
                    100 * similarity, 100 * accuracy,
                    100 * validation_coverage) if args.validation is not None else ''
                print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(it, 100 * objective, val, duration),
                      file=log)
                log.flush()

        t = time.time()
        it += 1

    # Write mapped embeddings
    srcfile = open(args.src_output, mode='w', encoding=args.encoding, errors='surrogateescape')
    trgfile = open(args.trg_output, mode='w', encoding=args.encoding, errors='surrogateescape')
    embeddings.write(src_words, xw, srcfile)
    embeddings.write(trg_words, z, trgfile)
    srcfile.close()
    trgfile.close()
def translate(words_to_translate,
              src_emb_info,
              tgt_emb_info,
              retrieval_method="csls",
              csls_k=10,
              batch_size=2500):
    # Read source embeddings
    src_words, x = src_emb_info
    src_word2ind = build_w2i(src_words)
    # Read target embeddings
    tgt_words, z = tgt_emb_info
    tgt_word2ind = build_w2i(tgt_words)

    xw = embeddings.length_normalize(x)
    zw = embeddings.length_normalize(z)

    # Keep only the words that are in the source vocabulary
    all_words = []
    trans_words = []
    trans_idx = []
    oov = set()
    for w in words_to_translate:
        try:
            all_words.append(w)
            w_ind = src_word2ind[w]
            trans_words.append(w)
            trans_idx.append(w_ind)
        except KeyError:
            oov.add(w)

    print(len(all_words))
    print(len(trans_words))
    print(len(trans_idx))
    print(len(oov))

    src = trans_idx
    print('Number of words to translate: {}'.format(len(src)))
    translation = collections.defaultdict(int)
    translation5 = collections.defaultdict(list)
    translation10 = collections.defaultdict(list)

    if retrieval_method == 'nn':  # Standard nearest neighbor
        for i in range(0, len(src), batch_size):
            j = min(i + batch_size, len(src))
            similarities = xw[src[i:j]].dot(zw.T)
            nn = similarities.argmax(axis=1).tolist()
            for k in range(j - i):
                translation[src[i + k]] = nn[k]
    elif retrieval_method == 'csls':
        t = time.time()
        nbrhood_x = np.zeros(xw.shape[0])
        nbrhood_z = np.zeros(zw.shape[0])
        nbrhood_z2 = cp.zeros(zw.shape[0])

        print('Computing X Neighbourhood')
        sys.stdout.flush()
        batch_num = 1
        for i in range(0, len(src), batch_size):
            j = min(i + batch_size, len(src))
            similarities = xw[src[i:j]].dot(zw.T)
            similarities_x = -1 * np.partition(-1 * similarities, csls_k - 1, axis=1)
            nbrhood_x[src[i:j]] = np.mean(similarities_x[:, :csls_k], axis=1)
            print('Completed batch {0} in {1}'.format(batch_num, time.time() - t))
            sys.stdout.flush()
            batch_num += 1
        print('Completed in {0} seconds'.format(time.time() - t))

        print('Computing Z Neighbourhood')
        sys.stdout.flush()
        batch_num = 1
        for i in range(0, zw.shape[0], batch_size):
            j = min(i + batch_size, zw.shape[0])
            similarities = -1 * cp.partition(
                -1 * cp.dot(cp.asarray(zw[i:j]), cp.transpose(cp.asarray(xw))),
                csls_k - 1, axis=1)[:, :csls_k]
            nbrhood_z2[i:j] = cp.mean(similarities[:, :csls_k], axis=1)
            print('Completed batch {0} in {1}'.format(batch_num, time.time() - t))
            sys.stdout.flush()
            batch_num += 1
        nbrhood_z = cp.asnumpy(nbrhood_z2)
        print(time.time() - t)

        print('Computing nearest neighbours')
        sys.stdout.flush()
        csls_alpha = 1
        batch_num = 1
        for i in range(0, len(src), batch_size):
            j = min(i + batch_size, len(src))
            similarities = xw[src[i:j]].dot(zw.T)
            similarities = np.transpose(
                np.transpose(2 * similarities) - csls_alpha * nbrhood_x[src[i:j]]
            ) - csls_alpha * nbrhood_z
            nn = similarities.argmax(axis=1).tolist()
            for k in range(j - i):
                translation[src[i + k]] = nn[k]
            print('Completed batch {0} in {1}'.format(batch_num, time.time() - t))
            sys.stdout.flush()
            batch_num += 1
        print('Completed in {0} seconds'.format(time.time() - t))
        sys.stdout.flush()

    # Get translations
    trans_pairs = []
    for w in trans_words:
        trans = ''
        if w in src_word2ind:
            trans = tgt_words[translation[src_word2ind[w]]]
        trans_pairs.append((w, trans))
    return dict(trans_pairs)
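# A minimal usage sketch for translate() (hypothetical file names and words;
# assumes the `embeddings` module and the `build_w2i` helper available in this
# codebase):
#
#   with open('src.vec', encoding='utf-8', errors='surrogateescape') as f:
#       src_emb_info = embeddings.read(f, max_voc=0, dtype='float32')
#   with open('tgt.vec', encoding='utf-8', errors='surrogateescape') as f:
#       tgt_emb_info = embeddings.read(f, max_voc=0, dtype='float32')
#   pairs = translate(['house', 'dog'], src_emb_info, tgt_emb_info,
#                     retrieval_method='csls', csls_k=10)
#   # pairs maps each translatable source word to its highest-scoring target word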
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Evaluate embeddings of two languages in a shared space in word translation induction')
    parser.add_argument('src_embeddings', help='the source language embeddings')
    parser.add_argument('trg_embeddings', help='the target language embeddings')
    parser.add_argument('-d', '--dictionary', default=sys.stdin.fileno(),
                        help='the test dictionary file (defaults to stdin)')
    parser.add_argument('--retrieval', default='nn',
                        choices=['nn', 'invnn', 'invsoftmax', 'csls'],
                        help='the retrieval method (nn: standard nearest neighbor; invnn: inverted nearest neighbor; invsoftmax: inverted softmax; csls: cross-domain similarity local scaling)')
    parser.add_argument('--inv_temperature', default=1, type=float,
                        help='the inverse temperature (only compatible with inverted softmax)')
    parser.add_argument('--inv_sample', default=None, type=int,
                        help='use a random subset of the source vocabulary for the inverse computations (only compatible with inverted softmax)')
    parser.add_argument('-k', '--neighborhood', default=10, type=int,
                        help='the neighborhood size (only compatible with csls)')
    parser.add_argument('--dot', action='store_true',
                        help='use the dot product in the similarity computations instead of the cosine')
    parser.add_argument('--encoding', default='utf-8',
                        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--seed', type=int, default=0, help='the random seed')
    parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32',
                        help='the floating-point precision (defaults to fp32)')
    parser.add_argument('--cuda', action='store_true', help='use cuda (requires cupy)')
    args = parser.parse_args()

    # Choose the right dtype for the desired precision
    if args.precision == 'fp16':
        dtype = 'float16'
    elif args.precision == 'fp32':
        dtype = 'float32'
    elif args.precision == 'fp64':
        dtype = 'float64'

    # Read input embeddings
    srcfile = open(args.src_embeddings, encoding=args.encoding, errors='surrogateescape')
    trgfile = open(args.trg_embeddings, encoding=args.encoding, errors='surrogateescape')
    src_words, x = embeddings.read(srcfile, dtype=dtype)
    trg_words, z = embeddings.read(trgfile, dtype=dtype)
    print('embeddings read')

    # NumPy/CuPy management
    if args.cuda:
        if not supports_cupy():
            print('ERROR: Install CuPy for CUDA support', file=sys.stderr)
            sys.exit(-1)
        xp = get_cupy()
        x = xp.asarray(x)
        z = xp.asarray(z)
    else:
        xp = np
    xp.random.seed(args.seed)

    # Length normalize embeddings so their dot product effectively computes the cosine similarity
    if not args.dot:
        embeddings.length_normalize(x)
        embeddings.length_normalize(z)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Read dictionary and compute coverage
    f = open(args.dictionary, encoding=args.encoding, errors='surrogateescape')
    src2trg = collections.defaultdict(set)
    oov = set()
    vocab = set()
    for line in f:
        src, trg = line.split()
        try:
            src_ind = src_word2ind[src]
            trg_ind = trg_word2ind[trg]
            src2trg[src_ind].add(trg_ind)
            vocab.add(src)
        except KeyError:
            oov.add(src)
    src = list(src2trg.keys())
    # If one of the translation options is in the vocabulary, then the entry is not an OOV
    oov -= vocab
    coverage = len(src2trg) / (len(src2trg) + len(oov))
    print('dictionary read')

    # Find translations
    translation = collections.defaultdict(int)
    if args.retrieval == 'nn':  # Standard nearest neighbor
        for i in range(0, len(src), BATCH_SIZE):
            j = min(i + BATCH_SIZE, len(src))
            similarities = x[src[i:j]].dot(z.T)
            nn = similarities.argmax(axis=1).tolist()
            for k in range(j - i):
                translation[src[i + k]] = nn[k]
    elif args.retrieval == 'invnn':  # Inverted nearest neighbor
        best_rank = np.full(len(src), x.shape[0], dtype=int)
        best_sim = np.full(len(src), -100, dtype=dtype)
        for i in range(0, z.shape[0], BATCH_SIZE):
            j = min(i + BATCH_SIZE, z.shape[0])
            similarities = z[i:j].dot(x.T)
            ind = (-similarities).argsort(axis=1)
            ranks = asnumpy(ind.argsort(axis=1)[:, src])
            sims = asnumpy(similarities[:, src])
            for k in range(i, j):
                for l in range(len(src)):
                    rank = ranks[k - i, l]
                    sim = sims[k - i, l]
                    if rank < best_rank[l] or (rank == best_rank[l] and sim > best_sim[l]):
                        best_rank[l] = rank
                        best_sim[l] = sim
                        translation[src[l]] = k
    elif args.retrieval == 'invsoftmax':  # Inverted softmax
        sample = xp.arange(x.shape[0]) if args.inv_sample is None else xp.random.randint(
            0, x.shape[0], args.inv_sample)
        partition = xp.zeros(z.shape[0])
        for i in range(0, len(sample), BATCH_SIZE):
            j = min(i + BATCH_SIZE, len(sample))
            partition += xp.exp(args.inv_temperature * z.dot(x[sample[i:j]].T)).sum(axis=1)
        for i in range(0, len(src), BATCH_SIZE):
            j = min(i + BATCH_SIZE, len(src))
            p = xp.exp(args.inv_temperature * x[src[i:j]].dot(z.T)) / partition
            nn = p.argmax(axis=1).tolist()
            for k in range(j - i):
                translation[src[i + k]] = nn[k]
    elif args.retrieval == 'csls':  # Cross-domain similarity local scaling
        knn_sim_bwd = xp.zeros(z.shape[0])
        for i in range(0, z.shape[0], BATCH_SIZE):
            j = min(i + BATCH_SIZE, z.shape[0])
            knn_sim_bwd[i:j] = topk_mean(z[i:j].dot(x.T), k=args.neighborhood, inplace=True)
        for i in range(0, len(src), BATCH_SIZE):
            j = min(i + BATCH_SIZE, len(src))
            similarities = 2 * x[src[i:j]].dot(z.T) - knn_sim_bwd  # Equivalent to the real CSLS scores for NN
            nn = similarities.argmax(axis=1).tolist()
            for k in range(j - i):
                translation[src[i + k]] = nn[k]

    # Compute accuracy
    accuracy = np.mean([1 if translation[i] in src2trg[i] else 0 for i in src])
    print('Coverage:{0:7.2%}  Accuracy:{1:7.2%}'.format(coverage, accuracy))
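# Reference sketch (not part of the script above): the CSLS criterion that the
# 'csls' branch implements, written as a standalone NumPy function. The script's
# `2*x.dot(z.T) - knn_sim_bwd` is equivalent for retrieval because the source-side
# neighbourhood term is constant per query and does not change the argmax.
def csls_scores_sketch(x_batch, z, knn_sim_bwd):
    # x_batch: (b, d) length-normalized source vectors
    # z:       (n, d) length-normalized target vectors
    # knn_sim_bwd: (n,) mean cosine of each target word to its k nearest source neighbours
    sim = x_batch.dot(z.T)        # cosine similarities, shape (b, n)
    return 2 * sim - knn_sim_bwd  # CSLS score up to a per-query constant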
def translate(src_emb_fname,
              tgt_emb_fname,
              trans_tgt_fname,
              trans_src_fname=None,
              retrieval_method="csls",
              csls_k=10,
              batch_size=2500):
    print('Loading train data...')
    srcfile = open(src_emb_fname, 'r', encoding='utf-8', errors='surrogateescape')
    tgtfile = open(tgt_emb_fname, 'r', encoding='utf-8', errors='surrogateescape')

    # Read source embeddings
    src_words, x = embeddings.read(srcfile, max_voc=0, dtype='float32')
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    # Read target embeddings
    tgt_words, z = embeddings.read(tgtfile, max_voc=0, dtype='float32')
    tgt_word2ind = {word: i for i, word in enumerate(tgt_words)}
    srcfile.close()
    tgtfile.close()

    xw = embeddings.length_normalize(x)
    zw = embeddings.length_normalize(z)

    all_words = []
    trans_words = []
    trans_idx = []
    oov = set()
    if trans_src_fname is not None:
        with open(trans_src_fname, 'r', encoding='utf-8',
                  errors='surrogateescape') as trans_src_file:
            for line in trans_src_file:
                try:
                    w = line.strip()
                    all_words.append(w)
                    w_ind = src_word2ind[w]
                    trans_words.append(w)
                    trans_idx.append(w_ind)
                except KeyError:
                    oov.add(w)
    else:
        all_words = src_words
        trans_words = src_words
        trans_idx = list(range(len(src_words)))
        oov = set()

    print(len(all_words))
    print(len(trans_words))
    print(len(trans_idx))
    print(len(oov))

    src = trans_idx
    translation = collections.defaultdict(int)
    translation5 = collections.defaultdict(list)
    translation10 = collections.defaultdict(list)

    if retrieval_method == 'nn':  # Standard nearest neighbor
        for i in range(0, len(src), batch_size):
            j = min(i + batch_size, len(src))
            similarities = xw[src[i:j]].dot(zw.T)
            nn = similarities.argmax(axis=1).tolist()
            similarities_idx = similarities.argsort(axis=1)
            nn5 = similarities_idx[:, -5:]
            nn10 = similarities_idx[:, -10:]
            for k in range(j - i):
                translation[src[i + k]] = nn[k]
                translation5[src[i + k]] = nn5[k]
                translation10[src[i + k]] = nn10[k]
    elif retrieval_method == 'csls':
        t = time.time()
        nbrhood_x = np.zeros(xw.shape[0])
        nbrhood_z = np.zeros(zw.shape[0])
        nbrhood_z2 = cp.zeros(zw.shape[0])

        print('Computing X Neighbourhood')
        for i in range(0, len(src), batch_size):
            j = min(i + batch_size, len(src))
            similarities = xw[src[i:j]].dot(zw.T)
            similarities_x = -1 * np.partition(-1 * similarities, csls_k - 1, axis=1)
            nbrhood_x[src[i:j]] = np.mean(similarities_x[:, :csls_k], axis=1)
        print('Completed in {0} seconds'.format(time.time() - t))

        print('Computing Z Neighbourhood')
        batch_num = 1
        for i in range(0, zw.shape[0], batch_size):
            j = min(i + batch_size, zw.shape[0])
            similarities = -1 * cp.partition(
                -1 * cp.dot(cp.asarray(zw[i:j]), cp.transpose(cp.asarray(xw))),
                csls_k - 1, axis=1)[:, :csls_k]
            nbrhood_z2[i:j] = cp.mean(similarities[:, :csls_k], axis=1)
            print('Completed batch {0} in {1}'.format(batch_num, time.time() - t))
            batch_num += 1
        nbrhood_z = cp.asnumpy(nbrhood_z2)
        print(time.time() - t)

        csls_alpha = 1
        for i in range(0, len(src), batch_size):
            j = min(i + batch_size, len(src))
            similarities = xw[src[i:j]].dot(zw.T)
            similarities = np.transpose(
                np.transpose(2 * similarities) - csls_alpha * nbrhood_x[src[i:j]]
            ) - csls_alpha * nbrhood_z
            nn = similarities.argmax(axis=1).tolist()
            print(time.time() - t)
            similarities = np.argsort(similarities, axis=1)
            nn5 = similarities[:, -5:]
            nn10 = similarities[:, -10:]
            for k in range(j - i):
                translation[src[i + k]] = nn[k]
                translation5[src[i + k]] = nn5[k]
                translation10[src[i + k]] = nn10[k]
        print('Completed in {0} seconds'.format(time.time() - t))

    # Write the translations (one pair per line)
    with open(trans_tgt_fname, 'w', encoding='utf-8',
              errors='surrogateescape') as trans_tgt_file:
        for w in trans_words:
            trans = ''
            if w in src_word2ind:
                trans = tgt_words[translation[src_word2ind[w]]]
            trans_tgt_file.write('{}\t{}\n'.format(w, trans))
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Evaluate embeddings in word similarity/relatedness')
    parser.add_argument('src_embeddings', help='the source language embeddings')
    parser.add_argument('trg_embeddings', nargs='?', help='the target language embeddings')
    parser.add_argument('-i', '--input', default=[sys.stdin.fileno()], nargs='+',
                        help='the input datasets (defaults to stdin)')
    parser.add_argument('-l', '--lowercase', action='store_true',
                        help='lowercase the words in the test files')
    parser.add_argument('--backoff', default=None, type=float,
                        help='use a backoff similarity score for OOV entries')
    parser.add_argument('--encoding', default='utf-8',
                        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp64',
                        help='the floating-point precision (defaults to fp64)')
    parser.add_argument('--sim', nargs='*',
                        help='the names of the datasets to include in the similarity results')
    parser.add_argument('--rel', nargs='*',
                        help='the names of the datasets to include in the relatedness results')
    parser.add_argument('--all', nargs='*',
                        help='the names of the datasets to include in the total results')
    args = parser.parse_args()

    # Choose the right dtype for the desired precision
    if args.precision == 'fp16':
        dtype = 'float16'
    elif args.precision == 'fp32':
        dtype = 'float32'
    elif args.precision == 'fp64':
        dtype = 'float64'

    # Parse test files
    word_pairs = []
    golds = []
    for filename in args.input:
        f = open(filename, encoding=args.encoding, errors='surrogateescape')
        word_pairs.append([])
        golds.append([])
        for line in f:
            if args.lowercase:
                line = line.lower()
            src, trg, score = line.split('\t')
            word_pairs[-1].append((src, trg))
            golds[-1].append(float(score))

    # Build vocabularies
    src_vocab = {pair[0] for pairs in word_pairs for pair in pairs}
    trg_vocab = {pair[1] for pairs in word_pairs for pair in pairs}

    # Read embeddings
    srcfile = open(args.src_embeddings, encoding=args.encoding, errors='surrogateescape')
    trgfile = open(args.src_embeddings if args.trg_embeddings is None else args.trg_embeddings,
                   encoding=args.encoding, errors='surrogateescape')
    src_words, src_matrix = embeddings.read(srcfile, vocabulary=src_vocab, dtype=dtype)
    trg_words, trg_matrix = embeddings.read(trgfile, vocabulary=trg_vocab, dtype=dtype)

    # Length normalize embeddings so their dot product effectively computes the cosine similarity
    src_matrix = embeddings.length_normalize(src_matrix)
    trg_matrix = embeddings.length_normalize(trg_matrix)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Compute system scores and correlations
    results = []
    for i in range(len(golds)):
        system = []
        gold = []
        oov = 0
        for gold_score, (src, trg) in zip(golds[i], word_pairs[i]):
            try:
                cos = np.dot(src_matrix[src_word2ind[src]], trg_matrix[trg_word2ind[trg]])
                system.append(cos)
                gold.append(gold_score)
            except KeyError:
                if args.backoff is None:
                    oov += 1
                else:
                    system.append(args.backoff)
                    gold.append(gold_score)
        name = os.path.splitext(os.path.basename(args.input[i]))[0]
        coverage = len(system) / (len(system) + oov)
        pearson = scipy.stats.pearsonr(gold, system)[0]
        spearman = scipy.stats.spearmanr(gold, system)[0]
        results.append((name, coverage, pearson, spearman))
        print('Coverage:{0:7.2%} Pearson:{1:7.2%} Spearman:{2:7.2%} | {3}'.format(
            coverage, pearson, spearman, name))

    # Compute and print total (averaged) results
    if len(results) > 1:
        print('-' * 80)
        if args.sim is not None:
            sim = list(zip(*[res for res in results if res[0] in args.sim]))
            print('Coverage:{0:7.2%} Pearson:{1:7.2%} Spearman:{2:7.2%} | sim.'.format(
                np.mean(sim[1]), np.mean(sim[2]), np.mean(sim[3])))
        if args.rel is not None:
            rel = list(zip(*[res for res in results if res[0] in args.rel]))
            print('Coverage:{0:7.2%} Pearson:{1:7.2%} Spearman:{2:7.2%} | rel.'.format(
                np.mean(rel[1]), np.mean(rel[2]), np.mean(rel[3])))
        if args.all is not None:
            results = [res for res in results if res[0] in args.all]
            results = list(zip(*results))
            print('Coverage:{0:7.2%} Pearson:{1:7.2%} Spearman:{2:7.2%} | all'.format(
                np.mean(results[1]), np.mean(results[2]), np.mean(results[3])))
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Evaluate embeddings of two languages in a shared space in word translation induction')
    parser.add_argument('src_embeddings', help='the source language embeddings')
    parser.add_argument('trg_embeddings', help='the target language embeddings')
    parser.add_argument('-d', '--dictionary', default=sys.stdin.fileno(),
                        help='the test dictionary file (defaults to stdin)')
    parser.add_argument('--dot', action='store_true',
                        help='use the dot product in the similarity computations instead of the cosine')
    parser.add_argument('--encoding', default='utf-8',
                        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--output', type=str,
                        help='file to write record of correct/incorrect translations')
    parser.add_argument('--identity', action='store_true',
                        help='do evaluation as normal, but if identity translation is available, use it instead')
    parser.add_argument('--identity_dict', action='store_true',
                        help='do evaluation as normal, but if identity translation is available within dictionary, use it instead')
    parser.add_argument('--identity_either', action='store_true',
                        help='do evaluation as normal, but if identity translation is available AND correct, use it instead')
    args = parser.parse_args()

    # Read input embeddings
    srcfile = open(args.src_embeddings, encoding=args.encoding, errors='surrogateescape')
    trgfile = open(args.trg_embeddings, encoding=args.encoding, errors='surrogateescape')
    src_words, src_matrix = embeddings.read(srcfile)
    trg_words, trg_matrix = embeddings.read(trgfile)

    # Length normalize embeddings so their dot product effectively computes the cosine similarity
    if not args.dot:
        src_matrix = embeddings.length_normalize(src_matrix)
        trg_matrix = embeddings.length_normalize(trg_matrix)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Read dictionary and compute coverage
    f = open(args.dictionary, encoding=args.encoding, errors='surrogateescape')
    src2trg = collections.defaultdict(set)
    dict_trgs = set()
    oov = set()
    vocab = set()
    for line in f:
        src, trg = line.split()
        try:
            src_ind = src_word2ind[src]
            trg_ind = trg_word2ind[trg]
            dict_trgs.add(trg)
            src2trg[src_ind].add(trg_ind)
            vocab.add(src)
        except KeyError:
            oov.add(src)
    # If one of the translation options is in the vocabulary, then the entry is not an OOV
    oov -= vocab
    coverage = len(src2trg) / (len(src2trg) + len(oov))

    if args.output:
        outputfile = open(args.output, mode='w', encoding=args.encoding,
                          errors='surrogateescape')

    # Compute accuracy
    correct = 0
    src, trg = zip(*src2trg.items())
    for i in range(0, len(src2trg), BATCH_SIZE):
        j = min(i + BATCH_SIZE, len(src2trg))
        similarities = src_matrix[list(src[i:j])].dot(trg_matrix.T)
        nn = np.argmax(similarities, axis=1).tolist()
        for k in range(j - i):
            sw = src_words[src[i + k]]
            tws = [trg_words[t] for t in trg[i + k]]
            bCor = False
            guess = trg_words[nn[k]]
            if args.identity and sw in trg_word2ind:
                # able to use identity as guess
                guess = sw
                if sw in tws:  # guessing identity is correct
                    bCor = True
                    correct += 1
                # else, guessing identity is incorrect
            elif args.identity_dict and sw in dict_trgs:
                guess = sw
                if sw in tws:
                    bCor = True
                    correct += 1
            elif nn[k] in trg[i + k]:
                correct += 1
                bCor = True
            elif args.identity_either and sw in tws:
                correct += 1
                bCor = True
                guess = sw
            if args.output:
                if bCor:
                    outputfile.write("Correct:{} {} {}\n".format(sw, guess, tws))
                else:
                    outputfile.write("Incorrect:{} {} {}\n".format(sw, guess, tws))

    print('Coverage:{0:7.2%}  Accuracy:{1:7.2%}'.format(coverage, correct / len(src2trg)))
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Map the source embeddings into the target embedding space')
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('src_output', help='the output source embeddings')
    parser.add_argument('trg_output', help='the output target embeddings')
    parser.add_argument('--encoding', default='utf-8',
                        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp64',
                        help='the floating-point precision (defaults to fp64)')
    parser.add_argument('--cuda', action='store_true', help='use cuda (requires cupy)')
    mapping_group = parser.add_argument_group(
        'mapping arguments', 'Basic embedding mapping arguments (EMNLP 2016)')
    mapping_group.add_argument('-d', '--dictionary', default=sys.stdin.fileno(),
                               help='the training dictionary file (defaults to stdin)')
    mapping_group.add_argument('--normalize',
                               choices=['unit', 'center', 'unitdim', 'centeremb'],
                               nargs='*', default=[],
                               help='the normalization actions to perform in order')
    mapping_type = mapping_group.add_mutually_exclusive_group()
    mapping_type.add_argument('-c', '--orthogonal', action='store_true',
                              help='use orthogonal constrained mapping')
    mapping_type.add_argument('-u', '--unconstrained', action='store_true',
                              help='use unconstrained mapping')
    self_learning_group = parser.add_argument_group(
        'self-learning arguments', 'Optional arguments for self-learning (ACL 2017)')
    self_learning_group.add_argument('--self_learning', action='store_true',
                                     help='enable self-learning')
    self_learning_group.add_argument('--direction',
                                     choices=['forward', 'backward', 'union'],
                                     default='forward',
                                     help='the direction for dictionary induction (defaults to forward)')
    self_learning_group.add_argument('--numerals', action='store_true',
                                     help='use latin numerals (i.e. words matching [0-9]+) as the seed dictionary')
    self_learning_group.add_argument('--threshold', default=0.000001, type=float,
                                     help='the convergence threshold (defaults to 0.000001)')
    self_learning_group.add_argument('--validation', default=None,
                                     help='a dictionary file for validation at each iteration')
    self_learning_group.add_argument('--log',
                                     help='write to a log file in tsv format at each iteration')
    self_learning_group.add_argument('-v', '--verbose', action='store_true',
                                     help='write log information to stderr at each iteration')
    advanced_group = parser.add_argument_group(
        'advanced mapping arguments', 'Advanced embedding mapping arguments (AAAI 2018)')
    advanced_group.add_argument('--whiten', action='store_true',
                                help='whiten the embeddings')
    advanced_group.add_argument('--src_reweight', type=float, default=0, nargs='?', const=1,
                                help='re-weight the source language embeddings')
    advanced_group.add_argument('--trg_reweight', type=float, default=0, nargs='?', const=1,
                                help='re-weight the target language embeddings')
    advanced_group.add_argument('--src_dewhiten', choices=['src', 'trg'],
                                help='de-whiten the source language embeddings')
    advanced_group.add_argument('--trg_dewhiten', choices=['src', 'trg'],
                                help='de-whiten the target language embeddings')
    advanced_group.add_argument('--dim_reduction', type=int, default=0,
                                help='apply dimensionality reduction')
    args = parser.parse_args()

    # Check command line arguments
    if (args.src_dewhiten is not None or args.trg_dewhiten is not None) and not args.whiten:
        print('ERROR: De-whitening requires whitening first', file=sys.stderr)
        sys.exit(-1)

    # Choose the right dtype for the desired precision
    if args.precision == 'fp16':
        dtype = 'float16'
    elif args.precision == 'fp32':
        dtype = 'float32'
    elif args.precision == 'fp64':
        dtype = 'float64'

    # Read input embeddings
    srcfile = open(args.src_input, encoding=args.encoding, errors='surrogateescape')
    trgfile = open(args.trg_input, encoding=args.encoding, errors='surrogateescape')
    src_words, x = embeddings.read(srcfile, dtype=dtype)
    trg_words, z = embeddings.read(trgfile, dtype=dtype)

    # NumPy/CuPy management
    if args.cuda:
        if not supports_cupy():
            print('ERROR: Install CuPy for CUDA support', file=sys.stderr)
            sys.exit(-1)
        xp = get_cupy()
        x = xp.asarray(x)
        z = xp.asarray(z)
    else:
        xp = np

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Build training dictionary
    src_indices = []
    trg_indices = []
    if args.numerals:
        if args.dictionary != sys.stdin.fileno():
            print('WARNING: Using numerals instead of the training dictionary',
                  file=sys.stderr)
        numeral_regex = re.compile('^[0-9]+$')
        src_numerals = {word for word in src_words if numeral_regex.match(word) is not None}
        trg_numerals = {word for word in trg_words if numeral_regex.match(word) is not None}
        numerals = src_numerals.intersection(trg_numerals)
        for word in numerals:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
    else:
        f = open(args.dictionary, encoding=args.encoding, errors='surrogateescape')
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                src_indices.append(src_ind)
                trg_indices.append(trg_ind)
            except KeyError:
                print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg),
                      file=sys.stderr)

    # Read validation dictionary
    if args.validation is not None:
        f = open(args.validation, encoding=args.encoding, errors='surrogateescape')
        validation = collections.defaultdict(set)
        oov = set()
        vocab = set()
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                validation[src_ind].add(trg_ind)
                vocab.add(src)
            except KeyError:
                oov.add(src)
        # If one of the translation options is in the vocabulary, then the entry is not an OOV
        oov -= vocab
        validation_coverage = len(validation) / (len(validation) + len(oov))

    # Create log file
    if args.log:
        log = open(args.log, mode='w', encoding=args.encoding, errors='surrogateescape')

    # STEP 0: Normalization
    for action in args.normalize:
        if action == 'unit':
            x = embeddings.length_normalize(x)
            z = embeddings.length_normalize(z)
        elif action == 'center':
            x = embeddings.mean_center(x)
            z = embeddings.mean_center(z)
        elif action == 'unitdim':
            x = embeddings.length_normalize_dimensionwise(x)
            z = embeddings.length_normalize_dimensionwise(z)
        elif action == 'centeremb':
            x = embeddings.mean_center_embeddingwise(x)
            z = embeddings.mean_center_embeddingwise(z)

    # Training loop
    prev_objective = objective = -100.
    it = 1
    t = time.time()
    while it == 1 or objective - prev_objective >= args.threshold:

        # Update the embedding mapping
        if args.orthogonal:  # orthogonal mapping
            u, s, vt = xp.linalg.svd(z[trg_indices].T.dot(x[src_indices]))
            w = vt.T.dot(u.T)
            xw = x.dot(w)
            zw = z
        elif args.unconstrained:  # unconstrained mapping
            x_pseudoinv = xp.linalg.inv(x[src_indices].T.dot(x[src_indices])).dot(
                x[src_indices].T)
            w = x_pseudoinv.dot(z[trg_indices])
            xw = x.dot(w)
            zw = z
        else:  # advanced mapping
            xw = x
            zw = z

            # STEP 1: Whitening
            def whitening_transformation(m):
                u, s, vt = xp.linalg.svd(m, full_matrices=False)
                return vt.T.dot(xp.diag(1 / s)).dot(vt)

            if args.whiten:
                wx1 = whitening_transformation(xw[src_indices])
                wz1 = whitening_transformation(zw[trg_indices])
                xw = xw.dot(wx1)
                zw = zw.dot(wz1)

            # STEP 2: Orthogonal mapping
            wx2, s, wz2_t = xp.linalg.svd(xw[src_indices].T.dot(zw[trg_indices]))
            wz2 = wz2_t.T
            xw = xw.dot(wx2)
            zw = zw.dot(wz2)

            # STEP 3: Re-weighting
            xw *= s**args.src_reweight
            zw *= s**args.trg_reweight

            # STEP 4: De-whitening
            if args.src_dewhiten == 'src':
                xw = xw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2))
            elif args.src_dewhiten == 'trg':
                xw = xw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2))
            if args.trg_dewhiten == 'src':
                zw = zw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2))
            elif args.trg_dewhiten == 'trg':
                zw = zw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2))

            # STEP 5: Dimensionality reduction
            if args.dim_reduction > 0:
                xw = xw[:, :args.dim_reduction]
                zw = zw[:, :args.dim_reduction]

        # Self-learning
        if args.self_learning:

            # Update the training dictionary
            best_sim_forward = xp.full(x.shape[0], -100, dtype=dtype)
            src_indices_forward = xp.arange(x.shape[0])
            trg_indices_forward = xp.zeros(x.shape[0], dtype=int)
            best_sim_backward = xp.full(z.shape[0], -100, dtype=dtype)
            src_indices_backward = xp.zeros(z.shape[0], dtype=int)
            trg_indices_backward = xp.arange(z.shape[0])
            for i in range(0, x.shape[0], MAX_DIM_X):
                j = min(x.shape[0], i + MAX_DIM_X)
                for k in range(0, z.shape[0], MAX_DIM_Z):
                    l = min(z.shape[0], k + MAX_DIM_Z)
                    sim = xw[i:j].dot(zw[k:l].T)
                    if args.direction in ('forward', 'union'):
                        ind = sim.argmax(axis=1)
                        val = sim[xp.arange(sim.shape[0]), ind]
                        ind += k
                        mask = (val > best_sim_forward[i:j])
                        best_sim_forward[i:j][mask] = val[mask]
                        trg_indices_forward[i:j][mask] = ind[mask]
                    if args.direction in ('backward', 'union'):
                        ind = sim.argmax(axis=0)
                        val = sim[ind, xp.arange(sim.shape[1])]
                        ind += i
                        mask = (val > best_sim_backward[k:l])
                        best_sim_backward[k:l][mask] = val[mask]
                        src_indices_backward[k:l][mask] = ind[mask]
            if args.direction == 'forward':
                src_indices = src_indices_forward
                trg_indices = trg_indices_forward
            elif args.direction == 'backward':
                src_indices = src_indices_backward
                trg_indices = trg_indices_backward
            elif args.direction == 'union':
                src_indices = xp.concatenate((src_indices_forward, src_indices_backward))
                trg_indices = xp.concatenate((trg_indices_forward, trg_indices_backward))

            # Objective function evaluation
            prev_objective = objective
            if args.direction == 'forward':
                objective = xp.mean(best_sim_forward).tolist()
            elif args.direction == 'backward':
                objective = xp.mean(best_sim_backward).tolist()
            elif args.direction == 'union':
                objective = (xp.mean(best_sim_forward) +
                             xp.mean(best_sim_backward)).tolist() / 2

            # Accuracy and similarity evaluation in validation
            if args.validation is not None:
                src = list(validation.keys())
                sim = xw[src].dot(zw.T)  # TODO Assuming that it fits in memory
                nn = asnumpy(sim.argmax(axis=1))
                accuracy = np.mean([
                    1 if nn[i] in validation[src[i]] else 0 for i in range(len(src))
                ])
                similarity = np.mean([
                    max([sim[i, j].tolist() for j in validation[src[i]]])
                    for i in range(len(src))
                ])

            # Logging
            duration = time.time() - t
            if args.verbose:
                print(file=sys.stderr)
                print('ITERATION {0} ({1:.2f}s)'.format(it, duration), file=sys.stderr)
                print('\t- Objective: {0:9.4f}%'.format(100 * objective), file=sys.stderr)
                if args.validation is not None:
                    print('\t- Val. similarity: {0:9.4f}%'.format(100 * similarity),
                          file=sys.stderr)
                    print('\t- Val. accuracy: {0:9.4f}%'.format(100 * accuracy),
                          file=sys.stderr)
                    print('\t- Val. coverage: {0:9.4f}%'.format(100 * validation_coverage),
                          file=sys.stderr)
                sys.stderr.flush()
            if args.log is not None:
                val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format(
                    100 * similarity, 100 * accuracy,
                    100 * validation_coverage) if args.validation is not None else ''
                print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(it, 100 * objective, val, duration),
                      file=log)
                log.flush()

        t = time.time()
        it += 1

    # Write mapped embeddings
    srcfile = open(args.src_output, mode='w', encoding=args.encoding, errors='surrogateescape')
    trgfile = open(args.trg_output, mode='w', encoding=args.encoding, errors='surrogateescape')
    embeddings.write(src_words, xw, srcfile)
    embeddings.write(trg_words, zw, trgfile)
    srcfile.close()
    trgfile.close()
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Evaluate embeddings in word analogy')
    parser.add_argument('embeddings', help='the word embeddings')
    parser.add_argument('-t', '--threshold', type=int, default=0,
                        help='reduce vocabulary of the model for fast approximate evaluation (0 = off, otherwise typical value is 30,000)')
    parser.add_argument('-i', '--input', default=sys.stdin.fileno(),
                        help='the test file (defaults to stdin)')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='verbose output (give category specific results)')
    parser.add_argument('-l', '--lowercase', action='store_true',
                        help='lowercase the words in the test file')
    parser.add_argument('--encoding', default='utf-8',
                        help='the character encoding for input/output (defaults to utf-8)')
    args = parser.parse_args()

    # Read input embeddings
    f = open(args.embeddings, encoding=args.encoding, errors='surrogateescape')
    words, matrix = embeddings.read(f, threshold=args.threshold)

    # Length normalize embeddings
    matrix = embeddings.length_normalize(matrix)

    # Build word to index map
    word2ind = {word: i for i, word in enumerate(words)}

    # Compute accuracy and coverage and print results
    category = category_name = None
    semantic = {'correct': 0, 'total': 0, 'oov': 0}
    syntactic = {'correct': 0, 'total': 0, 'oov': 0}
    f = open(args.input, encoding=args.encoding, errors='surrogateescape')
    for line in f:
        if line.startswith(': '):
            if args.verbose and category is not None:
                print('Coverage:{0:7.2%} Accuracy:{1:7.2%} | {2}'.format(
                    category['total'] / (category['total'] + category['oov']),
                    category['correct'] / category['total'], category_name))
            category_name = line[2:-1]
            current = syntactic if category_name.startswith('gram') else semantic
            category = {'correct': 0, 'total': 0, 'oov': 0}
        else:
            try:
                src1, trg1, src2, trg2 = [
                    word2ind[word.lower() if args.lowercase else word]
                    for word in line.split()
                ]
                similarities = np.dot(matrix, matrix[src2] - matrix[src1] + matrix[trg1])
                similarities[[src1, trg1, src2]] = -1
                closest = np.argmax(similarities)
                if closest == trg2:
                    category['correct'] += 1
                    current['correct'] += 1
                category['total'] += 1
                current['total'] += 1
            except KeyError:
                category['oov'] += 1
                current['oov'] += 1
    if args.verbose:
        print('Coverage:{0:7.2%} Accuracy:{1:7.2%} | {2}'.format(
            category['total'] / (category['total'] + category['oov']),
            category['correct'] / category['total'], category_name))
        print('-' * 80)
    print('Coverage:{0:7.2%} Accuracy:{1:7.2%} (sem:{2:7.2%}, syn:{3:7.2%})'.format(
        (semantic['total'] + syntactic['total']) /
        (semantic['total'] + syntactic['total'] + semantic['oov'] + syntactic['oov']),
        (semantic['correct'] + syntactic['correct']) /
        (semantic['total'] + syntactic['total']),
        semantic['correct'] / semantic['total'],
        syntactic['correct'] / syntactic['total']))
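# For reference, the analogy scoring above is the standard 3CosAdd rule over the
# length-normalized matrix (a minimal sketch; index names are illustrative):
#
#   query = matrix[src2] - matrix[src1] + matrix[trg1]   # e.g. king - man + woman
#   scores = matrix.dot(query)
#   scores[[src1, trg1, src2]] = -1                      # exclude the query words
#   prediction = scores.argmax()                         # compared against trg2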
def evaluate(src_emb_fname,
             tgt_emb_fname,
             dict_fname,
             max_voc=0,
             retrieval_method="csls",
             csls_k=10,
             batch_size=2500):
    print('Loading train data...')
    srcfile = open(src_emb_fname, 'r', encoding='utf-8', errors='surrogateescape')
    tgtfile = open(tgt_emb_fname, 'r', encoding='utf-8', errors='surrogateescape')

    # Read source embeddings
    src_words, x = embeddings.read(srcfile, max_voc=max_voc, dtype='float32')
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    # Read target embeddings
    tgt_words, z = embeddings.read(tgtfile, max_voc=max_voc, dtype='float32')
    tgt_word2ind = {word: i for i, word in enumerate(tgt_words)}
    srcfile.close()
    tgtfile.close()

    xw = embeddings.length_normalize(x)
    zw = embeddings.length_normalize(z)

    # Loading test dictionary
    f = open(dict_fname, encoding='utf-8', errors='surrogateescape')
    src2trg = collections.defaultdict(set)
    trg2src = collections.defaultdict(set)
    oov = set()
    vocab = set()
    for line in f:
        src, trg = line.split()
        # NOTE: relies on a module-level `args`; lowercases dictionary entries when set
        if args.max_vocab:
            src = src.lower()
            trg = trg.lower()
        try:
            src_ind = src_word2ind[src]
            trg_ind = tgt_word2ind[trg]
            src2trg[src_ind].add(trg_ind)
            trg2src[trg_ind].add(src_ind)
            vocab.add(src)
        except KeyError:
            oov.add(src)
    src = list(src2trg.keys())
    trgt = list(trg2src.keys())
    # If one of the translation options is in the vocabulary, then the entry is not an OOV
    oov -= vocab
    coverage = len(src2trg) / (len(src2trg) + len(oov))
    f.close()

    # Get translations
    translation = collections.defaultdict(int)
    translation5 = collections.defaultdict(list)
    translation10 = collections.defaultdict(list)
    if retrieval_method == 'nn':  # Standard nearest neighbor
        for i in range(0, len(src), batch_size):
            j = min(i + batch_size, len(src))
            similarities = xw[src[i:j]].dot(zw.T)
            nn = similarities.argmax(axis=1).tolist()
            similarities_idx = similarities.argsort(axis=1)
            nn5 = similarities_idx[:, -5:]
            nn10 = similarities_idx[:, -10:]
            for k in range(j - i):
                translation[src[i + k]] = nn[k]
                translation5[src[i + k]] = nn5[k]
                translation10[src[i + k]] = nn10[k]
    elif retrieval_method == 'csls':
        t = time.time()
        nbrhood_x = np.zeros(xw.shape[0])
        nbrhood_z = np.zeros(zw.shape[0])
        nbrhood_z2 = cp.zeros(zw.shape[0])

        print('Computing X Neighbourhood')
        for i in range(0, len(src), batch_size):
            j = min(i + batch_size, len(src))
            similarities = xw[src[i:j]].dot(zw.T)
            similarities_x = -1 * np.partition(-1 * similarities, csls_k - 1, axis=1)
            nbrhood_x[src[i:j]] = np.mean(similarities_x[:, :csls_k], axis=1)
        print('Completed in {0} seconds'.format(time.time() - t))

        print('Computing Z Neighbourhood')
        batch_num = 1
        for i in range(0, zw.shape[0], batch_size):
            j = min(i + batch_size, zw.shape[0])
            similarities = -1 * cp.partition(
                -1 * cp.dot(cp.asarray(zw[i:j]), cp.transpose(cp.asarray(xw))),
                csls_k - 1, axis=1)[:, :csls_k]
            nbrhood_z2[i:j] = cp.mean(similarities[:, :csls_k], axis=1)
            print('Completed batch {0} in {1}'.format(batch_num, time.time() - t))
            batch_num += 1
        nbrhood_z = cp.asnumpy(nbrhood_z2)
        print(time.time() - t)

        csls_alpha = 1
        for i in range(0, len(src), batch_size):
            j = min(i + batch_size, len(src))
            similarities = xw[src[i:j]].dot(zw.T)
            similarities = np.transpose(
                np.transpose(2 * similarities) - csls_alpha * nbrhood_x[src[i:j]]
            ) - csls_alpha * nbrhood_z
            nn = similarities.argmax(axis=1).tolist()
            print(time.time() - t)
            similarities = np.argsort(similarities, axis=1)
            nn5 = similarities[:, -5:]
            nn10 = similarities[:, -10:]
            for k in range(j - i):
                translation[src[i + k]] = nn[k]
                translation5[src[i + k]] = nn5[k]
                translation10[src[i + k]] = nn10[k]
        print('Completed in {0} seconds'.format(time.time() - t))

    # Evaluation metrics
    accuracy = np.mean([1 if translation[i] in src2trg[i] else 0 for i in src])
    mean = 0
    for i in src:
        for k in translation5[i]:
            if k in src2trg[i]:
                mean += 1
                break
    mean /= len(src)
    accuracy5 = mean
    mean = 0
    for i in src:
        for k in translation10[i]:
            if k in src2trg[i]:
                mean += 1
                break
    mean /= len(src)
    accuracy10 = mean
    print('Coverage:{0:7.2%} Accuracy:{1:7.2%} Accuracy(Top 5):{2:7.2%} Accuracy(Top 10):{3:7.2%}'.format(
        coverage, accuracy, accuracy5, accuracy10))
        # Excerpt from the end of the BiAE training loop (indentation reconstructed)
        if i % 5 == 4:
            print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 5))
            print('Cos loss: {}, regular_loss: {}, reconstruction_loss: {}'.format(l1, l2, l3))
            running_loss = 0.0

print('Cos loss: {}, regular_loss: {}, reconstruction_loss: {}'.format(l1, l2, l3))

# Map the full vocabularies through the trained network and save the results
source_file = open('new_embedding_size640.en', encoding='utf-8', errors='surrogateescape')
target_file = open('new_embedding_size640.de', encoding='utf-8', errors='surrogateescape')
en_words, en_vec = embeddings.read(source_file)
de_words, de_vec = embeddings.read(target_file)
en_vec = embeddings.length_normalize(en_vec)
de_vec = embeddings.length_normalize(de_vec)
input_view1 = Variable(torch.from_numpy(en_vec).cuda())
input_view2 = Variable(torch.from_numpy(de_vec).cuda())
res_envec, x1, res_devec, x2 = net(input_view1.float(), input_view2.float())
print(x1)
src_file = open('BiAE.en', mode='w', encoding='utf-8', errors='surrogateescape')
trg_file = open('BiAE.de', mode='w', encoding='utf-8', errors='surrogateescape')
res_envec = res_envec.data.cpu().numpy()
res_devec = res_devec.data.cpu().numpy()
def translate_topn(words_to_translate, src_emb_info, tgt_emb_info, retrieval_method="csls", topn=5, csls_k=10, batch_size=2500): """ The top-n are not necessarily sorted, but the scores can be used to retrieve the sorted top-k candidates Only the 'csls' search implementation is complete """ # Read source embeddings src_words, x = src_emb_info src_word2ind = build_w2i(src_words) # Read target embeddings tgt_words, z = tgt_emb_info tgt_word2ind = build_w2i(tgt_words) xw = embeddings.length_normalize(x) zw = embeddings.length_normalize(z) all_words = [] trans_words = [] trans_idx = [] oov = set() for w in words_to_translate: try: all_words.append(w) w_ind = src_word2ind[w] trans_words.append(w) trans_idx.append(w_ind) except KeyError: oov.add(w) print(len(all_words)) print(len(trans_words)) print(len(trans_idx)) print(len(oov)) src = trans_idx translation_topn = collections.defaultdict(list) translation_topn_prob = collections.defaultdict(list) if retrieval_method == 'nn': # Standard nearest neighbor for i in range(0, len(src), batch_size): j = min(i + batch_size, len(src)) similarities = xw[src[i:j]].dot(zw.T) similarities_idx = similarities.argsort(axis=1) similarities_scores = np.sort(similarities, axis=1) nn_topn = similarities_idx[:, -topn:] sim_unnorm = np.exp(similarities_scores[:, -topn:]) sim_total = np.sum(sim_unnorm, axis=1).reshape( (sim_unnorm.shape[0], 1)) # sim_unnorm has same first dimension as sim_total nn_topn_logprob = np.log(sim_unnorm / sim_total) ## softmax log probabilities for k in range(j - i): translation_topn[src[i + k]] = nn_topn[k] translation_topn_logprob[src[i + k]] = nn_topn_logprob[k] elif retrieval_method == 'csls': t = time.time() nbrhood_x = np.zeros(xw.shape[0]) nbrhood_z = np.zeros(zw.shape[0]) nbrhood_z2 = cp.zeros(zw.shape[0]) print('Computing X Neighbourhood') # batch_size=1000 batch_num = 1 for i in range(0, len(src), batch_size): j = min(i + batch_size, len(src)) similarities = xw[src[i:j]].dot(zw.T) # similarities_x = np.sort(similarities, axis=1) similarities_x = -1 * np.partition( -1 * similarities, csls_k - 1, axis=1) #similarities_x = -1*cp.partition(-1*cp.dot(cp.asarray(xw[src[i:j]]),cp.transpose(cp.asarray(zw))),csls_k-1 ,axis=1)[:,:csls_k] nbrhood_x[src[i:j]] = np.mean(similarities_x[:, :csls_k], axis=1) print('Completed batch {0} in {1}'.format(batch_num, time.time() - t)) batch_num += 1 print('Completed in {0} seconds'.format(time.time() - t)) print('Computing Z Neighbourhood') batch_num = 1 for i in range(0, zw.shape[0], batch_size): j = min(i + batch_size, zw.shape[0]) similarities = -1 * cp.partition( -1 * cp.dot(cp.asarray(zw[i:j]), cp.transpose(cp.asarray(xw))), csls_k - 1, axis=1)[:, :csls_k] nbrhood_z2[i:j] = (cp.mean(similarities[:, :csls_k], axis=1)) print('Completed batch {0} in {1}'.format(batch_num, time.time() - t)) batch_num += 1 # gc.collect() # t=time.time() nbrhood_z = cp.asnumpy(nbrhood_z2) # ipdb.set_trace() print(time.time() - t) csls_alpha = 1 for i in range(0, len(src), batch_size): j = min(i + batch_size, len(src)) similarities = xw[src[i:j]].dot(zw.T) similarities = np.transpose(np.transpose(2*similarities) - \ csls_alpha*nbrhood_x[src[i:j]])- \ csls_alpha*nbrhood_z similarities_idx = -1 * np.argpartition( -1 * similarities, topn - 1, axis=1) nn_topn = similarities_idx[:, -topn:] row_x = np.tile(np.array(range(topn)), (similarities_idx.shape[0], 1)) print('Shapes') print(similarities.shape) print(similarities_idx.shape) similarities_scores = similarities[row_x, nn_topn] sim_unnorm = np.exp(similarities_scores) # 
similarities_idx = similarities.argsort(axis=1) # similarities_scores = np.sort(similarities,axis=1) # print(time.time()-t) # nn_topn = similarities_idx[:,-topn:] # sim_unnorm = np.exp(similarities_scores[:,-topn:]) sim_total = np.sum(sim_unnorm, axis=1).reshape( (sim_unnorm.shape[0], 1)) # sim_unnorm has same first dimension as sim_total # nn_topn_logprob=np.log(sim_unnorm/sim_total) ## softmax log probabilities nn_topn_prob = sim_unnorm / sim_total ## softmax log probabilities for k in range(j - i): translation_topn[src[i + k]] = nn_topn[k] translation_topn_prob[src[i + k]] = nn_topn_prob[k] print('Completed in {0} seconds'.format(time.time() - t)) # get translations trans_pairs = [] for w in trans_words: if w in src_word2ind: srcid = src_word2ind[w] trans = [(tgt_words[translation_topn[srcid][r]], translation_topn_prob[srcid][r]) for r in range(topn)] trans_pairs.append((w, trans)) return dict(trans_pairs)
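# ---------------------------------------------------------------------------
# Illustration (not part of the original scripts): how translate_topn() turns
# the raw similarity scores of the top-n candidates into softmax
# probabilities. The scores below are made up.
# ---------------------------------------------------------------------------
import numpy as np

scores = np.array([[0.62, 0.55, 0.10],   # top-3 similarity scores for word 1
                   [0.80, 0.21, 0.20]])  # top-3 similarity scores for word 2
unnorm = np.exp(scores)                              # unnormalized weights
probs = unnorm / unnorm.sum(axis=1, keepdims=True)   # each row sums to 1
# probs[i, r] is the probability assigned to the r-th candidate of word i;
# np.log(probs) gives the log-probabilities used by the 'nn' branch.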
        if i % 5 == 4:
            print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 5))
            running_loss = 0.0
            # print(net.view1_fc.weight.grad)

# Map the source-side vocabulary through the trained linear mapping and prepare the output files
source_file = open('new_embedding_size200.en', encoding='utf-8', errors='surrogateescape')
target_file = open('new_embedding_size200.de', encoding='utf-8', errors='surrogateescape')
en_words, en_vec = embeddings.read(source_file)
de_words, de_vec = embeddings.read(target_file)
en_vec = embeddings.length_normalize(en_vec)
de_vec = embeddings.length_normalize(de_vec)
input_view1, input_view2 = Variable(torch.from_numpy(en_vec).cuda()), Variable(torch.from_numpy(de_vec).cuda())
res_envec = net(input_view1.float())
src_file = open('LinearMappingres.en', mode='w', encoding='utf-8', errors='surrogateescape')
trg_file = open('LinearMappingres.de', mode='w', encoding='utf-8', errors='surrogateescape')
def main(): # Parse command line arguments parser = argparse.ArgumentParser( description='Map the source embeddings into the target embedding space' ) parser.add_argument('src_input', help='the input source embeddings') parser.add_argument('mid_input', help='the input pivot embeddings') parser.add_argument('trg_input', help='the input target embeddings') parser.add_argument( '--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)') parser.add_argument( '--max_vocab', default=0, type=int, help='Maximum vocabulary to be loaded, 0 allows complete vocabulary') parser.add_argument('--verbose', default=0, type=int, help='Verbose') mapping_group = parser.add_argument_group( 'mapping arguments', 'Basic embedding mapping arguments') mapping_group.add_argument( '-dtrain1', '--dictionary_train1', default=sys.stdin.fileno(), help='the first training dictionary file (defaults to stdin)') mapping_group.add_argument( '-dtrain2', '--dictionary_train2', default=sys.stdin.fileno(), help='the second training dictionary file (defaults to stdin)') mapping_group.add_argument( '-dtest', '--dictionary_test', default=sys.stdin.fileno(), help='the test dictionary file (defaults to stdin)') mapping_group.add_argument( '--normalize', choices=['unit', 'center', 'unitdim', 'centeremb'], nargs='*', default=[], help='the normalization actions to perform in order') geomm_group = parser.add_argument_group('GeoMM arguments', 'Arguments for GeoMM method') geomm_group.add_argument('--l2_reg', type=float, default=1e2, help='Lambda for L2 Regularization') geomm_group.add_argument( '--max_opt_time', type=int, default=5000, help='Maximum time limit for optimization in seconds') geomm_group.add_argument( '--max_opt_iter', type=int, default=150, help='Maximum number of iterations for optimization') eval_group = parser.add_argument_group('evaluation arguments', 'Arguments for evaluation') eval_group.add_argument('--normalize_eval', action='store_true', help='Normalize the embeddings at test time') eval_group.add_argument('--eval_batch_size', type=int, default=1000, help='Batch size for evaluation') eval_group.add_argument('--csls_neighbourhood', type=int, default=10, help='Neighbourhood size for CSLS') args = parser.parse_args() BATCH_SIZE = args.eval_batch_size # Logging method_name = os.path.join('logs', 'geomm_cmp_pip') directory = os.path.join( os.path.join(os.getcwd(), method_name), datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) if not os.path.exists(directory): os.makedirs(directory) log_file_name, file_extension = os.path.splitext( os.path.basename(args.dictionary_test)) log_file_name = log_file_name + '.log' class Logger(object): def __init__(self): self.terminal = sys.stdout self.log = open(os.path.join(directory, log_file_name), "a") def write(self, message): self.terminal.write(message) self.log.write(message) def flush(self): #this flush method is needed for python 3 compatibility. #this handles the flush command by doing nothing. #you might want to specify some extra behavior here. 
pass sys.stdout = Logger() if args.verbose: print('Current arguments: {0}'.format(args)) dtype = 'float32' if args.verbose: print('Loading train data...') # Read input embeddings srcfile = open(args.src_input, encoding=args.encoding, errors='surrogateescape') midfile = open(args.mid_input, encoding=args.encoding, errors='surrogateescape') trgfile = open(args.trg_input, encoding=args.encoding, errors='surrogateescape') src_words, x = embeddings.read(srcfile, max_voc=args.max_vocab, dtype=dtype) mid_words, y = embeddings.read(midfile, max_voc=args.max_vocab, dtype=dtype) trg_words, z = embeddings.read(trgfile, max_voc=args.max_vocab, dtype=dtype) # Build word to index map src_word2ind = {word: i for i, word in enumerate(src_words)} mid_word2ind = {word: i for i, word in enumerate(mid_words)} trg_word2ind = {word: i for i, word in enumerate(trg_words)} # Build training dictionary-1 src_indices12 = [] trg_indices12 = [] f = open(args.dictionary_train1, encoding=args.encoding, errors='surrogateescape') for line in f: src, trg = line.split() if args.max_vocab: src = src.lower() trg = trg.lower() try: src_ind = src_word2ind[src] trg_ind = mid_word2ind[trg] src_indices12.append(src_ind) trg_indices12.append(trg_ind) except KeyError: if args.verbose: print('WARNING: OOV dictionary entry ({0} - {1})'.format( src, trg), file=sys.stderr) f.close() # Build training dictionary-2 src_indices23 = [] trg_indices23 = [] f = open(args.dictionary_train2, encoding=args.encoding, errors='surrogateescape') for line in f: src, trg = line.split() if args.max_vocab: src = src.lower() trg = trg.lower() try: src_ind = mid_word2ind[src] trg_ind = trg_word2ind[trg] src_indices23.append(src_ind) trg_indices23.append(trg_ind) except KeyError: if args.verbose: print('WARNING: OOV dictionary entry ({0} - {1})'.format( src, trg), file=sys.stderr) f.close() if args.verbose: print('Normalizing embeddings...') # STEP 0: Normalization for action in args.normalize: if action == 'unit': x = embeddings.length_normalize(x) y = embeddings.length_normalize(y) z = embeddings.length_normalize(z) elif action == 'center': x = embeddings.mean_center(x) y = embeddings.mean_center(y) z = embeddings.mean_center(z) elif action == 'unitdim': x = embeddings.length_normalize_dimensionwise(x) y = embeddings.length_normalize_dimensionwise(y) z = embeddings.length_normalize_dimensionwise(z) elif action == 'centeremb': x = embeddings.mean_center_embeddingwise(x) y = embeddings.mean_center_embeddingwise(y) z = embeddings.mean_center_embeddingwise(z) # Step 1.1: Optimization-1 if args.verbose: print('Beginning Optimization-1') start_time = time.time() x_count = len(set(src_indices12)) y_count = len(set(trg_indices12)) A = np.zeros((x_count, y_count)) # Creating dictionary matrix from training set map_dict_src = {} map_dict_trg = {} I = 0 uniq_src = [] uniq_trg = [] for i in range(len(src_indices12)): if src_indices12[i] not in map_dict_src.keys(): map_dict_src[src_indices12[i]] = I I += 1 uniq_src.append(src_indices12[i]) J = 0 for j in range(len(trg_indices12)): if trg_indices12[j] not in map_dict_trg.keys(): map_dict_trg[trg_indices12[j]] = J J += 1 uniq_trg.append(trg_indices12[j]) for i in range(len(src_indices12)): A[map_dict_src[src_indices12[i]], map_dict_trg[trg_indices12[i]]] = 1 np.random.seed(0) Lambda = args.l2_reg U1 = TT.matrix() U2 = TT.matrix() B = TT.matrix() cost = TT.sum(((shared(x[uniq_src]).dot(U1.dot(B.dot(U2.T)))).dot( shared(y[uniq_trg]).T) - A)**2) + 0.5 * Lambda * (TT.sum(B**2)) solver = 
ConjugateGradient(maxtime=args.max_opt_time, maxiter=args.max_opt_iter) low_rank = 300 manifold = Product([ Stiefel(x.shape[1], low_rank), Stiefel(y.shape[1], low_rank), PositiveDefinite(low_rank) ]) problem = Problem(manifold=manifold, cost=cost, arg=[U1, U2, B], verbosity=3) wopt = solver.solve(problem) w = wopt U1 = w[0] U2 = w[1] B = w[2] w12 = U1.dot(B).dot(U2.T) u11 = U1 u21 = U2 b1 = B # Step 1.2: Optimization-2 if args.verbose: print('Beginning Optimization-2') y_count = len(set(src_indices23)) z_count = len(set(trg_indices23)) A = np.zeros((y_count, z_count)) # Creating dictionary matrix from training set map_dict_src = {} map_dict_trg = {} I = 0 uniq_src = [] uniq_trg = [] for i in range(len(src_indices23)): if src_indices23[i] not in map_dict_src.keys(): map_dict_src[src_indices23[i]] = I I += 1 uniq_src.append(src_indices23[i]) J = 0 for j in range(len(trg_indices23)): if trg_indices23[j] not in map_dict_trg.keys(): map_dict_trg[trg_indices23[j]] = J J += 1 uniq_trg.append(trg_indices23[j]) for i in range(len(src_indices23)): A[map_dict_src[src_indices23[i]], map_dict_trg[trg_indices23[i]]] = 1 np.random.seed(0) U1 = TT.matrix() U2 = TT.matrix() B = TT.matrix() cost = TT.sum(((shared(y[uniq_src]).dot(U1.dot(B.dot(U2.T)))).dot( shared(z[uniq_trg]).T) - A)**2) + 0.5 * Lambda * (TT.sum(B**2)) solver = ConjugateGradient(maxtime=args.max_opt_time, maxiter=args.max_opt_iter) low_rank = 300 manifold = Product([ Stiefel(y.shape[1], low_rank), Stiefel(z.shape[1], low_rank), PositiveDefinite(low_rank) ]) problem = Problem(manifold=manifold, cost=cost, arg=[U1, U2, B], verbosity=3) wopt = solver.solve(problem) w = wopt U1 = w[0] U2 = w[1] B = w[2] w23 = U1.dot(B).dot(U2.T) u22 = U1 u32 = U2 b2 = B # Step 2: Transformation w12_1 = u11.dot(scipy.linalg.sqrtm(b1)) w12_2 = u21.dot(scipy.linalg.sqrtm(b1)) w23_1 = u22.dot(scipy.linalg.sqrtm(b2)) w23_2 = u32.dot(scipy.linalg.sqrtm(b2)) end_time = time.time() if args.verbose: print('Completed training in {0:.2f} seconds'.format(end_time - start_time)) gc.collect() # Step 3: Evaluation # Loading test dictionary f = open(args.dictionary_test, encoding=args.encoding, errors='surrogateescape') src2trg = collections.defaultdict(set) trg2src = collections.defaultdict(set) oov = set() vocab = set() for line in f: src, trg = line.split() if args.max_vocab: src = src.lower() trg = trg.lower() try: src_ind = src_word2ind[src] trg_ind = trg_word2ind[trg] src2trg[src_ind].add(trg_ind) trg2src[trg_ind].add(src_ind) vocab.add(src) except KeyError: oov.add(src) src = list(src2trg.keys()) trgt = list(trg2src.keys()) oov -= vocab # If one of the translation options is in the vocabulary, then the entry is not an oov coverage = len(src2trg) / (len(src2trg) + len(oov)) f.close() # Composition (CMP) xw = x.dot(w12).dot(w23) zw = z if args.normalize_eval: xw = embeddings.length_normalize(xw) zw = embeddings.length_normalize(zw) translation = collections.defaultdict(int) translation5 = collections.defaultdict(list) translation10 = collections.defaultdict(list) t = time.time() nbrhood_x = np.zeros(xw.shape[0]) nbrhood_z = np.zeros(zw.shape[0]) nbrhood_z2 = cp.zeros(zw.shape[0]) for i in range(0, len(src), BATCH_SIZE): j = min(i + BATCH_SIZE, len(src)) similarities = xw[src[i:j]].dot(zw.T) similarities_x = -1 * np.partition( -1 * similarities, args.csls_neighbourhood - 1, axis=1) nbrhood_x[src[i:j]] = np.mean( similarities_x[:, :args.csls_neighbourhood], axis=1) batch_num = 1 for i in range(0, zw.shape[0], BATCH_SIZE): j = min(i + BATCH_SIZE, zw.shape[0]) similarities = 
-1 * cp.partition( -1 * cp.dot(cp.asarray(zw[i:j]), cp.transpose(cp.asarray(xw))), args.csls_neighbourhood - 1, axis=1)[:, :args.csls_neighbourhood] nbrhood_z2[i:j] = (cp.mean(similarities[:, :args.csls_neighbourhood], axis=1)) batch_num += 1 nbrhood_z = cp.asnumpy(nbrhood_z2) for i in range(0, len(src), BATCH_SIZE): j = min(i + BATCH_SIZE, len(src)) similarities = xw[src[i:j]].dot(zw.T) similarities = np.transpose( np.transpose(2 * similarities) - nbrhood_x[src[i:j]]) - nbrhood_z nn = similarities.argmax(axis=1).tolist() similarities = np.argsort((similarities), axis=1) nn5 = (similarities[:, -5:]) nn10 = (similarities[:, -10:]) for k in range(j - i): translation[src[i + k]] = nn[k] translation5[src[i + k]] = nn5[k] translation10[src[i + k]] = nn10[k] accuracy = np.mean([1 if translation[i] in src2trg[i] else 0 for i in src]) mean = 0 for i in src: for k in translation5[i]: if k in src2trg[i]: mean += 1 break mean /= len(src) accuracy5 = mean mean = 0 for i in src: for k in translation10[i]: if k in src2trg[i]: mean += 1 break mean /= len(src) accuracy10 = mean print( 'CMP: Coverage:{0:7.2%} Accuracy:{1:7.2%} Accuracy(Top 5):{2:7.2%} Accuracy(Top 10):{3:7.2%}' .format(coverage, accuracy, accuracy5, accuracy10)) # Pipeline (PIP) xw = x.dot(w12_1) zw = y.dot(w12_2) if args.normalize_eval: xw = embeddings.length_normalize(xw) zw = embeddings.length_normalize(zw) translation12 = collections.defaultdict(int) # PIP-Stage 1 t = time.time() nbrhood_x = np.zeros(xw.shape[0]) nbrhood_z = np.zeros(zw.shape[0]) nbrhood_z2 = cp.zeros(zw.shape[0]) for i in range(0, len(src), BATCH_SIZE): j = min(i + BATCH_SIZE, len(src)) similarities = xw[src[i:j]].dot(zw.T) similarities_x = -1 * np.partition( -1 * similarities, args.csls_neighbourhood - 1, axis=1) nbrhood_x[src[i:j]] = np.mean( similarities_x[:, :args.csls_neighbourhood], axis=1) batch_num = 1 for i in range(0, zw.shape[0], BATCH_SIZE): j = min(i + BATCH_SIZE, zw.shape[0]) similarities = -1 * cp.partition( -1 * cp.dot(cp.asarray(zw[i:j]), cp.transpose(cp.asarray(xw))), args.csls_neighbourhood - 1, axis=1)[:, :args.csls_neighbourhood] nbrhood_z2[i:j] = (cp.mean(similarities[:, :args.csls_neighbourhood], axis=1)) batch_num += 1 nbrhood_z = cp.asnumpy(nbrhood_z2) for i in range(0, len(src), BATCH_SIZE): j = min(i + BATCH_SIZE, len(src)) similarities = xw[src[i:j]].dot(zw.T) similarities = np.transpose( np.transpose(2 * similarities) - nbrhood_x[src[i:j]]) - nbrhood_z nn = similarities.argmax(axis=1).tolist() for k in range(j - i): translation[src[i + k]] = nn[k] # PIP-Stage 2 mid = [translation[sr] for sr in src] xw = y.dot(w23_1) zw = z.dot(w23_2) if args.normalize_eval: xw = embeddings.length_normalize(xw) zw = embeddings.length_normalize(zw) translation = collections.defaultdict(int) translation5 = collections.defaultdict(list) translation10 = collections.defaultdict(list) t = time.time() nbrhood_x = np.zeros(xw.shape[0]) nbrhood_z = np.zeros(zw.shape[0]) nbrhood_z2 = cp.zeros(zw.shape[0]) for i in range(0, len(mid), BATCH_SIZE): j = min(i + BATCH_SIZE, len(mid)) similarities = xw[mid[i:j]].dot(zw.T) # similarities_x = np.sort(similarities, axis=1) similarities_x = -1 * np.partition( -1 * similarities, args.csls_neighbourhood - 1, axis=1) nbrhood_x[mid[i:j]] = np.mean( similarities_x[:, :args.csls_neighbourhood], axis=1) batch_num = 1 for i in range(0, zw.shape[0], BATCH_SIZE): j = min(i + BATCH_SIZE, zw.shape[0]) similarities = -1 * cp.partition( -1 * cp.dot(cp.asarray(zw[i:j]), cp.transpose(cp.asarray(xw))), args.csls_neighbourhood - 1, axis=1)[:, 
:args.csls_neighbourhood] nbrhood_z2[i:j] = (cp.mean(similarities[:, :args.csls_neighbourhood], axis=1)) batch_num += 1 nbrhood_z = cp.asnumpy(nbrhood_z2) for i in range(0, len(mid), BATCH_SIZE): j = min(i + BATCH_SIZE, len(mid)) similarities = xw[mid[i:j]].dot(zw.T) similarities = np.transpose( np.transpose(2 * similarities) - nbrhood_x[mid[i:j]]) - nbrhood_z nn = similarities.argmax(axis=1).tolist() similarities = np.argsort((similarities), axis=1) nn5 = (similarities[:, -5:]) nn10 = (similarities[:, -10:]) for k in range(j - i): translation[src[i + k]] = nn[k] translation5[src[i + k]] = nn5[k] translation10[src[i + k]] = nn10[k] accuracy = np.mean([1 if translation[i] in src2trg[i] else 0 for i in src]) mean = 0 for i in src: for k in translation5[i]: if k in src2trg[i]: mean += 1 break mean /= len(src) accuracy5 = mean mean = 0 for i in src: for k in translation10[i]: if k in src2trg[i]: mean += 1 break mean /= len(src) accuracy10 = mean print( 'PIP: Coverage:{0:7.2%} Accuracy:{1:7.2%} Accuracy(Top 5):{2:7.2%} Accuracy(Top 10):{3:7.2%}' .format(coverage, accuracy, accuracy5, accuracy10))
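# ---------------------------------------------------------------------------
# Illustration (not part of the original scripts): the difference between the
# two evaluation modes above, reduced to plain nearest-neighbour retrieval on
# made-up matrices (the script itself uses the factored maps and CSLS).
# CMP composes the two learned maps and retrieves directly in target space;
# PIP first retrieves a pivot word and then translates that pivot word.
# ---------------------------------------------------------------------------
import numpy as np

rng = np.random.RandomState(0)
x, y, z = rng.randn(4, 3), rng.randn(5, 3), rng.randn(6, 3)  # src / pivot / trg embeddings
w12, w23 = rng.randn(3, 3), rng.randn(3, 3)                  # src->pivot and pivot->trg maps

# CMP: src -> trg through the composed map
cmp_nn = x.dot(w12).dot(w23).dot(z.T).argmax(axis=1)

# PIP: src -> pivot, then pivot -> trg
pivot_nn = x.dot(w12).dot(y.T).argmax(axis=1)         # best pivot word per source word
pip_nn = y[pivot_nn].dot(w23).dot(z.T).argmax(axis=1)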
def main(): # Parse command line arguments parser = argparse.ArgumentParser( description='Map the source embeddings into the target embedding space' ) parser.add_argument('src_input', help='the input source embeddings') parser.add_argument('trg_input', help='the input target embeddings') parser.add_argument( '--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)') parser.add_argument( '--max_vocab', default=0, type=int, help='Maximum vocabulary to be loaded, 0 allows complete vocabulary') parser.add_argument('--verbose', default=0, type=int, help='Verbose') mapping_group = parser.add_argument_group( 'mapping arguments', 'Basic embedding mapping arguments') mapping_group.add_argument( '-dtrain', '--dictionary_train', default=sys.stdin.fileno(), help='the training dictionary file (defaults to stdin)') mapping_group.add_argument( '-dtest', '--dictionary_test', default=sys.stdin.fileno(), help='the test dictionary file (defaults to stdin)') mapping_group.add_argument( '-dtrainspl', '--dictionary_trainspl', default=sys.stdin.fileno(), help='the training dictionary split file (defaults to stdin)') mapping_group.add_argument( '-dvalspl', '--dictionary_valspl', default=sys.stdin.fileno(), help='the validation dictionary split file (defaults to stdin)') mapping_group.add_argument( '--normalize', choices=['unit', 'center', 'unitdim', 'centeremb'], nargs='*', default=[], help='the normalization actions to perform in order') geomm_group = parser.add_argument_group('GeoMM arguments', 'Arguments for GeoMM method') geomm_group.add_argument('--l2_reg', type=float, default=1e-1, help='Lambda for L2 Regularization') geomm_group.add_argument( '--max_opt_time', type=int, default=5000, help='Maximum time limit for optimization in seconds') geomm_group.add_argument( '--max_opt_iter', type=int, default=150, help='Maximum number of iterations for optimization') geomm_group.add_argument( '--x_cutoff', type=int, default=25000, help='Vocabulary cutoff for first language for bootstrapping') geomm_group.add_argument( '--z_cutoff', type=int, default=25000, help='Vocabulary cutoff for second language for bootstrapping') geomm_group.add_argument( '--patience', type=int, default=1, help= 'Number of iterations with a decrease in validation accuracy permissible during bootstrapping' ) eval_group = parser.add_argument_group('evaluation arguments', 'Arguments for evaluation') eval_group.add_argument('--normalize_eval', action='store_true', help='Normalize the embeddings at test time') eval_group.add_argument('--eval_batch_size', type=int, default=500, help='Batch size for evaluation') eval_group.add_argument('--csls_neighbourhood', type=int, default=10, help='Neighbourhood size for CSLS') args = parser.parse_args() BATCH_SIZE = args.eval_batch_size # Logging method_name = os.path.join('logs', 'geomm_semi') directory = os.path.join( os.path.join(os.getcwd(), method_name), datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) if not os.path.exists(directory): os.makedirs(directory) log_file_name, file_extension = os.path.splitext( os.path.basename(args.dictionary_train)) log_file_name = log_file_name + '.log' class Logger(object): def __init__(self): self.terminal = sys.stdout self.log = open(os.path.join(directory, log_file_name), "a") def write(self, message): self.terminal.write(message) self.log.write(message) def flush(self): #this flush method is needed for python 3 compatibility. #this handles the flush command by doing nothing. 
#you might want to specify some extra behavior here. pass sys.stdout = Logger() if args.verbose: print('Current arguments: {0}'.format(args)) dtype = 'float32' if args.verbose: print('Loading train data...') # Read input embeddings srcfile = open(args.src_input, encoding=args.encoding, errors='surrogateescape') trgfile = open(args.trg_input, encoding=args.encoding, errors='surrogateescape') src_words, x = embeddings.read(srcfile, max_voc=args.max_vocab, dtype=dtype) trg_words, z = embeddings.read(trgfile, max_voc=args.max_vocab, dtype=dtype) # Build word to index map src_word2ind = {word: i for i, word in enumerate(src_words)} trg_word2ind = {word: i for i, word in enumerate(trg_words)} # Build training dictionary src_indices = [] trg_indices = [] f = open(args.dictionary_train, encoding=args.encoding, errors='surrogateescape') for line in f: src, trg = line.split() if args.max_vocab: src = src.lower() trg = trg.lower() try: src_ind = src_word2ind[src] trg_ind = trg_word2ind[trg] src_indices.append(src_ind) trg_indices.append(trg_ind) except KeyError: if args.verbose: print('WARNING: OOV dictionary entry ({0} - {1})'.format( src, trg), file=sys.stderr) f.close() src_indices = src_indices trg_indices = trg_indices src_indices_train = list(src_indices) trg_indices_train = list(trg_indices) src_indices = [] trg_indices = [] # Loading train-split dictionary f = open(args.dictionary_trainspl, encoding=args.encoding, errors='surrogateescape') for line in f: src, trg = line.split() if args.max_vocab: src = src.lower() trg = trg.lower() try: src_ind = src_word2ind[src] trg_ind = trg_word2ind[trg] src_indices.append(src_ind) trg_indices.append(trg_ind) except KeyError: if args.verbose: print('WARNING: OOV dictionary entry ({0} - {1})'.format( src, trg), file=sys.stderr) f.close() if args.verbose: print('Normalizing embeddings...') # STEP 0: Normalization for action in args.normalize: if action == 'unit': x = embeddings.length_normalize(x) z = embeddings.length_normalize(z) elif action == 'center': x = embeddings.mean_center(x) z = embeddings.mean_center(z) elif action == 'unitdim': x = embeddings.length_normalize_dimensionwise(x) z = embeddings.length_normalize_dimensionwise(z) elif action == 'centeremb': x = embeddings.mean_center_embeddingwise(x) z = embeddings.mean_center_embeddingwise(z) orig_src = src_indices orig_trg = trg_indices best_val_acc = 0 best_add_src = [] best_add_trg = [] add_src = [] add_trg = [] if args.verbose: print('Beginning Optimization') start_time = time.time() it_count = 0 drop_count = 0 # Bootstrap loop while True: if args.verbose: print('Starting bootstrap iteration {0}'.format(it_count + 1)) # Step 1.1: Optimization x_count = len(set(src_indices)) z_count = len(set(trg_indices)) # Creating dictionary matrix from training set map_dict_src = {} map_dict_trg = {} I = 0 uniq_src = [] uniq_trg = [] for i in range(len(src_indices)): if src_indices[i] not in map_dict_src.keys(): map_dict_src[src_indices[i]] = I I += 1 uniq_src.append(src_indices[i]) J = 0 for j in range(len(trg_indices)): if trg_indices[j] not in map_dict_trg.keys(): map_dict_trg[trg_indices[j]] = J J += 1 uniq_trg.append(trg_indices[j]) np.random.seed(0) Lambda = args.l2_reg U1 = TT.matrix() U2 = TT.matrix() B = TT.matrix() X_tot = x[uniq_src].T.dot(x[uniq_src]) Z_tot = z[uniq_trg].T.dot(z[uniq_trg]) W = U1.dot(B.dot(U2.T)) cost = (TT.nlinalg.trace( U2.dot( B.dot( U1.T.dot( shared(X_tot).dot( U1.dot(B.dot(U2.T.dot(shared(Z_tot))))))))) - 2 * TT.sum( (shared(x[src_indices]).dot(W)) * shared(z[trg_indices])) ) 
/ (len(src_indices)) + 0.5 * Lambda * (TT.sum(B**2)) solver = ConjugateGradient(maxtime=args.max_opt_time, maxiter=args.max_opt_iter, mingradnorm=1e-15) low_rank = 300 manifold = Product([ Stiefel(x.shape[1], low_rank), Stiefel(z.shape[1], low_rank), PositiveDefinite(low_rank) ]) problem = Problem(manifold=manifold, cost=cost, arg=[U1, U2, B], verbosity=3) wopt = solver.solve(problem) w = wopt U1 = w[0] U2 = w[1] B = w[2] # Step 1.2: Transformation xw = x.dot(U1).dot(scipy.linalg.sqrtm(B)) zw = z.dot(U2).dot(scipy.linalg.sqrtm(B)) it_count += 1 # Step 1.3: Compute Validation Accuracy if args.normalize_eval: xw = embeddings.length_normalize(xw) zw = embeddings.length_normalize(zw) # Loading validation dictionary f = open(args.dictionary_valspl, encoding=args.encoding, errors='surrogateescape') src2trg = collections.defaultdict(set) trg2src = collections.defaultdict(set) oov = set() vocab = set() for line in f: src, trg = line.split() if args.max_vocab: src = src.lower() trg = trg.lower() try: src_ind = src_word2ind[src] trg_ind = trg_word2ind[trg] src2trg[src_ind].add(trg_ind) trg2src[trg_ind].add(src_ind) vocab.add(src) except KeyError: oov.add(src) src = list(src2trg.keys()) trgt = list(trg2src.keys()) oov -= vocab # If one of the translation options is in the vocabulary, then the entry is not an oov coverage = len(src2trg) / (len(src2trg) + len(oov)) f.close() translation = collections.defaultdict(int) translation5 = collections.defaultdict(list) translation10 = collections.defaultdict(list) t = time.time() nbrhood_x = cp.zeros(xw.shape[0]) nbrhood_z = cp.zeros(zw.shape[0]) for i in range(0, len(src), BATCH_SIZE): j = min(i + BATCH_SIZE, len(src)) similarities = -1 * cp.partition( -1 * cp.dot(cp.asarray(xw[src[i:j]]), cp.transpose(cp.asarray(zw))), args.csls_neighbourhood - 1, axis=1)[:, :args.csls_neighbourhood] nbrhood_x[src[i:j]] = (cp.mean(similarities, axis=1)) for i in range(0, zw.shape[0], BATCH_SIZE): j = min(i + BATCH_SIZE, zw.shape[0]) similarities = -1 * cp.partition( -1 * cp.dot(cp.asarray(zw[i:j]), cp.transpose(cp.asarray(xw))), args.csls_neighbourhood - 1, axis=1)[:, :args.csls_neighbourhood] nbrhood_z[i:j] = (cp.mean(similarities, axis=1)) for i in range(0, len(src), BATCH_SIZE): j = min(i + BATCH_SIZE, len(src)) similarities = cp.transpose( cp.transpose(2 * cp.asarray(xw[src[i:j]]).dot( cp.transpose(cp.asarray(zw)))) - nbrhood_x[src[i:j]]) - nbrhood_z nn = cp.argmax(similarities, axis=1).tolist() similarities = cp.argsort((similarities), axis=1) nn5 = (similarities[:, -5:]) nn10 = (similarities[:, -10:]) for k in range(j - i): translation[src[i + k]] = nn[k] translation5[src[i + k]] = nn5[k].tolist() translation10[src[i + k]] = nn10[k].tolist() accuracy = np.mean( [1 if translation[i] in src2trg[i] else 0 for i in src]) mean = 0 for i in src: for k in translation5[i]: if k in src2trg[i]: mean += 1 break mean /= len(src) accuracy5 = mean mean = 0 for i in src: for k in translation10[i]: if k in src2trg[i]: mean += 1 break mean /= len(src) accuracy10 = mean drop_count += 1 if accuracy > best_val_acc: if args.verbose: print('Improvement of {0}% over best validation accuracy!'. 
format((accuracy - best_val_acc) * 100)) best_val_acc = accuracy best_add_src = list(add_src) best_add_trg = list(add_trg) drop_count = 0 if args.verbose: print( 'Val Set:- Coverage:{0:7.2%} Accuracy:{1:7.2%} Accuracy(Top 5):{2:7.2%} Accuracy(Top 10):{3:7.2%}' .format(coverage, accuracy, accuracy5, accuracy10)) if drop_count >= args.patience: if args.verbose: print('Training ended') break # Step 1.4: Dictionary Induction Stage (Bootstrap) # Consider x_cutoff and z_cutoff to be the vocabulary of the two languages(First k words of vocabulary are the most frequent words in the language(as per standard word embeddings)). # CSLS Inferencing will be performed on this vocabulary subset. Bidirectional bootstrapping is performed. # Dictionary entries for first "x_cutoff" words of Language-1 and for first "z-cutoff" words of Language-2 are inferred. Original training dictionary is also added. # Total dictionary size=x_cutoff+z_cutoff+size(train_set) if args.normalize_eval: xw = embeddings.length_normalize(xw) zw = embeddings.length_normalize(zw) x_vocab_size = min(xw.shape[0], args.x_cutoff) z_vocab_size = min(zw.shape[0], args.z_cutoff) t = time.time() nbrhood_x = cp.zeros(x_vocab_size) best_sim_x = cp.zeros(x_vocab_size) best_sim_x_csls = cp.zeros(x_vocab_size) nbrhood_z = cp.zeros(z_vocab_size) batch_num = 1 for i in range(0, x_vocab_size, BATCH_SIZE): j = min(i + BATCH_SIZE, x_vocab_size) similarities = -1 * cp.partition( -1 * cp.dot(cp.asarray(xw[i:j]), cp.transpose(cp.asarray(zw[:z_vocab_size]))), args.csls_neighbourhood - 1, axis=1)[:, :args.csls_neighbourhood] nbrhood_x[i:j] = (cp.mean(similarities, axis=1)) best_sim_x[i:j] = (cp.max(similarities, axis=1)) batch_num += 1 batch_num = 1 for i in range(0, z_vocab_size, BATCH_SIZE): j = min(i + BATCH_SIZE, z_vocab_size) similarities = -1 * cp.partition( -1 * cp.dot(cp.asarray(zw[i:j]), cp.transpose(cp.asarray(xw[:x_vocab_size]))), args.csls_neighbourhood - 1, axis=1)[:, :args.csls_neighbourhood] nbrhood_z[i:j] = (cp.mean(similarities, axis=1)) batch_num += 1 src_indices = list(range(0, x_vocab_size)) trg_indices = [] batch_num = 1 for i in range(0, x_vocab_size, BATCH_SIZE): j = min(i + BATCH_SIZE, x_vocab_size) similarities = cp.transpose( cp.transpose(2 * cp.asarray(xw[i:j]).dot( cp.transpose(cp.asarray(zw[:z_vocab_size])))) - nbrhood_x[i:j]) - nbrhood_z nn = cp.argmax(similarities, axis=1).tolist() trg_indices.append(nn) batch_num += 1 src_indices2 = [] trg_indices2 = list(range(0, z_vocab_size)) batch_num = 1 for i in range(0, z_vocab_size, BATCH_SIZE): j = min(i + BATCH_SIZE, z_vocab_size) similarities = cp.transpose( cp.transpose(2 * cp.asarray(zw[i:j]).dot( cp.transpose(cp.asarray(xw[:x_vocab_size])))) - nbrhood_z[i:j]) - nbrhood_x nn = cp.argmax(similarities, axis=1).tolist() src_indices2.append(nn) batch_num += 1 trg_indices = [item for sublist in trg_indices for item in sublist] src_indices2 = [item for sublist in src_indices2 for item in sublist] add_src = list(src_indices + src_indices2) add_trg = list(trg_indices + trg_indices2) src_indices = src_indices + src_indices2 + orig_src trg_indices = trg_indices + trg_indices2 + orig_trg end_time = time.time() if args.verbose: print('Completed bootstrapping in {0:.2f} seconds'.format(end_time - start_time)) # Step 2: Final Training with bootstrapped dictionary if args.verbose: print('Training final model') src_indices = best_add_src + src_indices_train trg_indices = best_add_trg + trg_indices_train x_count = len(set(src_indices)) z_count = len(set(trg_indices)) # Creating dictionary matrix 
from training set map_dict_src = {} map_dict_trg = {} I = 0 uniq_src = [] uniq_trg = [] for i in range(len(src_indices)): if src_indices[i] not in map_dict_src.keys(): map_dict_src[src_indices[i]] = I I += 1 uniq_src.append(src_indices[i]) J = 0 for j in range(len(trg_indices)): if trg_indices[j] not in map_dict_trg.keys(): map_dict_trg[trg_indices[j]] = J J += 1 uniq_trg.append(trg_indices[j]) np.random.seed(0) Lambda = args.l2_reg U1 = TT.matrix() U2 = TT.matrix() B = TT.matrix() X_tot = x[uniq_src].T.dot(x[uniq_src]) Z_tot = z[uniq_trg].T.dot(z[uniq_trg]) W = U1.dot(B.dot(U2.T)) cost = (TT.nlinalg.trace( U2.dot( B.dot( U1.T.dot( shared(X_tot).dot(U1.dot(B.dot(U2.T.dot(shared(Z_tot))))))) )) - 2 * TT.sum( (shared(x[src_indices]).dot(W)) * shared(z[trg_indices])) ) / len(src_indices) + 0.5 * Lambda * (TT.sum(B**2)) solver = ConjugateGradient(maxtime=args.max_opt_time, maxiter=args.max_opt_iter) low_rank = 300 manifold = Product([ Stiefel(x.shape[1], low_rank), Stiefel(z.shape[1], low_rank), PositiveDefinite(low_rank) ]) problem = Problem(manifold=manifold, cost=cost, arg=[U1, U2, B], verbosity=3) wopt = solver.solve(problem) w = wopt U1 = w[0] U2 = w[1] B = w[2] xw = x.dot(U1).dot(scipy.linalg.sqrtm(B)) zw = z.dot(U2).dot(scipy.linalg.sqrtm(B)) gc.collect() # Step 3: Evaluation if args.verbose: print('Beginning Evaluation') if args.normalize_eval: xw = embeddings.length_normalize(xw) zw = embeddings.length_normalize(zw) # Loading test dictionary f = open(args.dictionary_test, encoding=args.encoding, errors='surrogateescape') src2trg = collections.defaultdict(set) trg2src = collections.defaultdict(set) oov = set() vocab = set() for line in f: src, trg = line.split() if args.max_vocab: src = src.lower() trg = trg.lower() try: src_ind = src_word2ind[src] trg_ind = trg_word2ind[trg] src2trg[src_ind].add(trg_ind) trg2src[trg_ind].add(src_ind) vocab.add(src) except KeyError: oov.add(src) src = list(src2trg.keys()) trgt = list(trg2src.keys()) oov -= vocab # If one of the translation options is in the vocabulary, then the entry is not an oov coverage = len(src2trg) / (len(src2trg) + len(oov)) f.close() translation = collections.defaultdict(int) translation5 = collections.defaultdict(list) translation10 = collections.defaultdict(list) t = time.time() nbrhood_x = np.zeros(xw.shape[0]) nbrhood_z = np.zeros(zw.shape[0]) nbrhood_z2 = cp.zeros(zw.shape[0]) for i in range(0, len(src), BATCH_SIZE): j = min(i + BATCH_SIZE, len(src)) similarities = xw[src[i:j]].dot(zw.T) similarities_x = -1 * np.partition( -1 * similarities, args.csls_neighbourhood - 1, axis=1) nbrhood_x[src[i:j]] = np.mean( similarities_x[:, :args.csls_neighbourhood], axis=1) batch_num = 1 for i in range(0, zw.shape[0], BATCH_SIZE): j = min(i + BATCH_SIZE, zw.shape[0]) similarities = -1 * cp.partition( -1 * cp.dot(cp.asarray(zw[i:j]), cp.transpose(cp.asarray(xw))), args.csls_neighbourhood - 1, axis=1)[:, :args.csls_neighbourhood] nbrhood_z2[i:j] = (cp.mean(similarities, axis=1)) batch_num += 1 nbrhood_z = cp.asnumpy(nbrhood_z2) for i in range(0, len(src), BATCH_SIZE): j = min(i + BATCH_SIZE, len(src)) similarities = xw[src[i:j]].dot(zw.T) similarities = np.transpose( np.transpose(2 * similarities) - nbrhood_x[src[i:j]]) - nbrhood_z nn = similarities.argmax(axis=1).tolist() similarities = np.argsort((similarities), axis=1) nn5 = (similarities[:, -5:]) nn10 = (similarities[:, -10:]) for k in range(j - i): translation[src[i + k]] = nn[k] translation5[src[i + k]] = nn5[k] translation10[src[i + k]] = nn10[k] accuracy = np.mean([1 if 
translation[i] in src2trg[i] else 0 for i in src]) mean = 0 for i in src: for k in translation5[i]: if k in src2trg[i]: mean += 1 break mean /= len(src) accuracy5 = mean mean = 0 for i in src: for k in translation10[i]: if k in src2trg[i]: mean += 1 break mean /= len(src) accuracy10 = mean print( 'Coverage:{0:7.2%} Accuracy:{1:7.2%} Accuracy(Top 5):{2:7.2%} Accuracy(Top 10):{3:7.2%}' .format(coverage, accuracy, accuracy5, accuracy10))
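# ---------------------------------------------------------------------------
# Illustration (not part of the original scripts): the bidirectional
# dictionary induction performed inside the bootstrap loop above, reduced to
# plain nearest-neighbour retrieval on made-up data (the script itself uses
# CSLS, restricts both sides to the x_cutoff / z_cutoff most frequent words,
# and then appends the original training dictionary).
# ---------------------------------------------------------------------------
import numpy as np

rng = np.random.RandomState(0)
xw = rng.randn(6, 4)
zw = rng.randn(7, 4)
xw /= np.linalg.norm(xw, axis=1, keepdims=True)
zw /= np.linalg.norm(zw, axis=1, keepdims=True)

# forward direction: one induced translation for every source word
src_fwd = list(range(xw.shape[0]))
trg_fwd = xw.dot(zw.T).argmax(axis=1).tolist()
# backward direction: one induced translation for every target word
trg_bwd = list(range(zw.shape[0]))
src_bwd = zw.dot(xw.T).argmax(axis=1).tolist()

# induced dictionary used for the next optimization round
induced_src = src_fwd + src_bwd
induced_trg = trg_fwd + trg_bwd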
def main(): # Parse command line arguments parser = argparse.ArgumentParser(description='Map the source embeddings into the target embedding space') parser.add_argument('src_input', help='the input source embeddings') parser.add_argument('trg_input', help='the input target embeddings') parser.add_argument('--model_path', default=None, type=str, help='directory to save the model') parser.add_argument('--geomm_embeddings_path', default=None, type=str, help='directory to save the output GeoMM latent space embeddings. The output embeddings are normalized.') parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)') parser.add_argument('--max_vocab', default=0,type=int, help='Maximum vocabulary to be loaded, 0 allows complete vocabulary') parser.add_argument('--verbose', default=0,type=int, help='Verbose') mapping_group = parser.add_argument_group('mapping arguments', 'Basic embedding mapping arguments') mapping_group.add_argument('-dtrain', '--dictionary_train', default=sys.stdin.fileno(), help='the training dictionary file (defaults to stdin)') mapping_group.add_argument('-dtest', '--dictionary_test', default=sys.stdin.fileno(), help='the test dictionary file (defaults to stdin)') mapping_group.add_argument('--normalize', choices=['unit', 'center', 'unitdim', 'centeremb'], nargs='*', default=[], help='the normalization actions to perform in order') geomm_group = parser.add_argument_group('GeoMM arguments', 'Arguments for GeoMM method') geomm_group.add_argument('--l2_reg', type=float,default=1e2, help='Lambda for L2 Regularization') geomm_group.add_argument('--max_opt_time', type=int,default=5000, help='Maximum time limit for optimization in seconds') geomm_group.add_argument('--max_opt_iter', type=int,default=150, help='Maximum number of iterations for optimization') eval_group = parser.add_argument_group('evaluation arguments', 'Arguments for evaluation') eval_group.add_argument('--normalize_eval', action='store_true', help='Normalize the embeddings at test time') eval_group.add_argument('--eval_batch_size', type=int,default=1000, help='Batch size for evaluation') eval_group.add_argument('--csls_neighbourhood', type=int,default=10, help='Neighbourhood size for CSLS') args = parser.parse_args() BATCH_SIZE = args.eval_batch_size ## Logging #method_name = os.path.join('logs','geomm') #directory = os.path.join(os.path.join(os.getcwd(),method_name), datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) #if not os.path.exists(directory): # os.makedirs(directory) #log_file_name, file_extension = os.path.splitext(os.path.basename(args.dictionary_train)) #log_file_name = log_file_name + '.log' #class Logger(object): # def __init__(self): # self.terminal = sys.stdout # self.log = open(os.path.join(directory,log_file_name), "a") # def write(self, message): # self.terminal.write(message) # self.log.write(message) # def flush(self): # #this flush method is needed for python 3 compatibility. # #this handles the flush command by doing nothing. # #you might want to specify some extra behavior here. 
# pass #sys.stdout = Logger() if args.verbose: print('Current arguments: {0}'.format(args)) dtype = 'float32' if args.verbose: print('Loading train data...') # Read input embeddings srcfile = open(args.src_input, encoding=args.encoding, errors='surrogateescape') trgfile = open(args.trg_input, encoding=args.encoding, errors='surrogateescape') src_words, x = embeddings.read(srcfile,max_voc=args.max_vocab, dtype=dtype) trg_words, z = embeddings.read(trgfile,max_voc=args.max_vocab, dtype=dtype) # Build word to index map src_word2ind = {word: i for i, word in enumerate(src_words)} trg_word2ind = {word: i for i, word in enumerate(trg_words)} # Build training dictionary noov=0 src_indices = [] trg_indices = [] f = open(args.dictionary_train, encoding=args.encoding, errors='surrogateescape') for line in f: src,trg = line.split() if args.max_vocab: src=src.lower() trg=trg.lower() try: src_ind = src_word2ind[src] trg_ind = trg_word2ind[trg] src_indices.append(src_ind) trg_indices.append(trg_ind) except KeyError: noov+=1 if args.verbose: print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg)) #, file=sys.stderr f.close() if args.verbose: print('Number of training pairs having at least one OOV: {}'.format(noov)) src_indices = src_indices trg_indices = trg_indices if args.verbose: print('Normalizing embeddings...') # STEP 0: Normalization for action in args.normalize: if action == 'unit': x = embeddings.length_normalize(x) z = embeddings.length_normalize(z) elif action == 'center': x = embeddings.mean_center(x) z = embeddings.mean_center(z) elif action == 'unitdim': x = embeddings.length_normalize_dimensionwise(x) z = embeddings.length_normalize_dimensionwise(z) elif action == 'centeremb': x = embeddings.mean_center_embeddingwise(x) z = embeddings.mean_center_embeddingwise(z) # Step 1: Optimization if args.verbose: print('Beginning Optimization') start_time = time.time() x_count = len(set(src_indices)) z_count = len(set(trg_indices)) A = np.zeros((x_count,z_count)) # Creating dictionary matrix from training set map_dict_src={} map_dict_trg={} I=0 uniq_src=[] uniq_trg=[] for i in range(len(src_indices)): if src_indices[i] not in map_dict_src.keys(): map_dict_src[src_indices[i]]=I I+=1 uniq_src.append(src_indices[i]) J=0 for j in range(len(trg_indices)): if trg_indices[j] not in map_dict_trg.keys(): map_dict_trg[trg_indices[j]]=J J+=1 uniq_trg.append(trg_indices[j]) for i in range(len(src_indices)): A[map_dict_src[src_indices[i]],map_dict_trg[trg_indices[i]]]=1 np.random.seed(0) Lambda=args.l2_reg U1 = TT.matrix() U2 = TT.matrix() B = TT.matrix() Kx, Kz = x[uniq_src], z[uniq_trg] XtAZ = Kx.T.dot(A.dot(Kz)) XtX = Kx.T.dot(Kx) ZtZ = Kz.T.dot(Kz) # AA = np.sum(A*A) # this can be added if cost needs to be compared to original geomm W = (U1.dot(B)).dot(U2.T) regularizer = 0.5*Lambda*(TT.sum(B**2)) sXtX = shared(XtX) sZtZ = shared(ZtZ) sXtAZ = shared(XtAZ) cost = regularizer wtxtxw = W.T.dot(sXtX.dot(W)) wtxtxwztz = wtxtxw.dot(sZtZ) cost += TT.nlinalg.trace(wtxtxwztz) cost += -2 * TT.sum(W * sXtAZ) # cost += shared(AA) # this can be added if cost needs to be compared with original geomm solver = ConjugateGradient(maxtime=args.max_opt_time,maxiter=args.max_opt_iter) manifold =Product([Stiefel(x.shape[1], x.shape[1]),Stiefel(z.shape[1], x.shape[1]),PositiveDefinite(x.shape[1])]) #manifold =Product([Stiefel(x.shape[1], 200),Stiefel(z.shape[1], 200),PositiveDefinite(200)]) problem = Problem(manifold=manifold, cost=cost, arg=[U1,U2,B], verbosity=3) wopt = solver.solve(problem) w= wopt U1 = w[0] U2 = 
w[1] B = w[2] ### Save the models if requested if args.model_path is not None: os.makedirs(args.model_path,exist_ok=True) np.savetxt('{}/U_src.csv'.format(args.model_path),U1) np.savetxt('{}/U_tgt.csv'.format(args.model_path),U2) np.savetxt('{}/B.csv'.format(args.model_path),B) # Step 2: Transformation xw = x.dot(U1).dot(scipy.linalg.sqrtm(B)) zw = z.dot(U2).dot(scipy.linalg.sqrtm(B)) end_time = time.time() if args.verbose: print('Completed training in {0:.2f} seconds'.format(end_time-start_time)) gc.collect() ### Save the GeoMM embeddings if requested xw_n = embeddings.length_normalize(xw) zw_n = embeddings.length_normalize(zw) if args.geomm_embeddings_path is not None: os.makedirs(args.geomm_embeddings_path,exist_ok=True) out_emb_fname=os.path.join(args.geomm_embeddings_path,'src.vec') with open(out_emb_fname,'w',encoding=args.encoding) as outfile: embeddings.write(src_words,xw_n,outfile) out_emb_fname=os.path.join(args.geomm_embeddings_path,'trg.vec') with open(out_emb_fname,'w',encoding=args.encoding) as outfile: embeddings.write(trg_words,zw_n,outfile) # Step 3: Evaluation if args.normalize_eval: xw = xw_n zw = zw_n X = xw[src_indices] Z = zw[trg_indices] # Loading test dictionary f = open(args.dictionary_test, encoding=args.encoding, errors='surrogateescape') src2trg = collections.defaultdict(set) trg2src = collections.defaultdict(set) oov = set() vocab = set() for line in f: src, trg = line.split() if args.max_vocab: src=src.lower() trg=trg.lower() try: src_ind = src_word2ind[src] trg_ind = trg_word2ind[trg] src2trg[src_ind].add(trg_ind) trg2src[trg_ind].add(src_ind) vocab.add(src) except KeyError: oov.add(src) src = list(src2trg.keys()) trgt = list(trg2src.keys()) oov -= vocab # If one of the translation options is in the vocabulary, then the entry is not an oov coverage = len(src2trg) / (len(src2trg) + len(oov)) f.close() translation = collections.defaultdict(int) translation5 = collections.defaultdict(list) translation10 = collections.defaultdict(list) ### compute nearest neigbours of x in z t=time.time() nbrhood_x=np.zeros(xw.shape[0]) for i in range(0, len(src), BATCH_SIZE): j = min(i + BATCH_SIZE, len(src)) similarities = xw[src[i:j]].dot(zw.T) similarities_x = -1*np.partition(-1*similarities,args.csls_neighbourhood-1 ,axis=1) nbrhood_x[src[i:j]]=np.mean(similarities_x[:,:args.csls_neighbourhood],axis=1) ### compute nearest neigbours of z in x (GPU version) nbrhood_z=np.zeros(zw.shape[0]) with cp.cuda.Device(0): nbrhood_z2=cp.zeros(zw.shape[0]) batch_num=1 for i in range(0, zw.shape[0], BATCH_SIZE): j = min(i + BATCH_SIZE, zw.shape[0]) similarities = -1*cp.partition(-1*cp.dot(cp.asarray(zw[i:j]),cp.transpose(cp.asarray(xw))),args.csls_neighbourhood-1 ,axis=1)[:,:args.csls_neighbourhood] nbrhood_z2[i:j]=(cp.mean(similarities[:,:args.csls_neighbourhood],axis=1)) batch_num+=1 nbrhood_z=cp.asnumpy(nbrhood_z2) #### compute nearest neigbours of z in x (CPU version) #nbrhood_z=np.zeros(zw.shape[0]) #for i in range(0, len(zw.shape[0]), BATCH_SIZE): # j = min(i + BATCH_SIZE, len(zw.shape[0])) # similarities = zw[i:j].dot(xw.T) # similarities_z = -1*np.partition(-1*similarities,args.csls_neighbourhood-1 ,axis=1) # nbrhood_z[i:j]=np.mean(similarities_z[:,:args.csls_neighbourhood],axis=1) #### find translation #for i in range(0, len(src), BATCH_SIZE): # j = min(i + BATCH_SIZE, len(src)) # similarities = xw[src[i:j]].dot(zw.T) # similarities = np.transpose(np.transpose(2*similarities) - nbrhood_x[src[i:j]]) - nbrhood_z # nn = similarities.argmax(axis=1).tolist() # similarities = 
np.argsort((similarities),axis=1) # nn5 = (similarities[:,-5:]) # nn10 = (similarities[:,-10:]) # for k in range(j-i): # translation[src[i+k]] = nn[k] # translation5[src[i+k]] = nn5[k] # translation10[src[i+k]] = nn10[k] #if args.geomm_embeddings_path is not None: # delim=',' # os.makedirs(args.geomm_embeddings_path,exist_ok=True) # translations_fname=os.path.join(args.geomm_embeddings_path,'translations.csv') # with open(translations_fname,'w',encoding=args.encoding) as translations_file: # for src_id in src: # src_word = src_words[src_id] # all_trg_words = [ trg_words[trg_id] for trg_id in src2trg[src_id] ] # trgout_words = [ trg_words[j] for j in translation10[src_id] ] # ss = list(nn10[src_id,:]) # # p1 = ':'.join(all_trg_words) # p2 = delim.join( [ '{}{}{}'.format(w,delim,s) for w,s in zip(trgout_words,ss) ] ) # translations_file.write( '{s}{delim}{p1}{delim}{p2}\n'.format(s=src_word, delim=delim, p1=p1, p2=p2) ) ### find translation (and write to file if output requested) delim=',' translations_file =None if args.geomm_embeddings_path is not None: os.makedirs(args.geomm_embeddings_path,exist_ok=True) translations_fname=os.path.join(args.geomm_embeddings_path,'translations.csv') translations_file = open(translations_fname,'w',encoding=args.encoding) for i in range(0, len(src), BATCH_SIZE): j = min(i + BATCH_SIZE, len(src)) similarities = xw[src[i:j]].dot(zw.T) similarities = np.transpose(np.transpose(2*similarities) - nbrhood_x[src[i:j]]) - nbrhood_z nn = similarities.argmax(axis=1).tolist() similarities = np.argsort((similarities),axis=1) nn5 = (similarities[:,-5:]) nn10 = (similarities[:,-10:]) for k in range(j-i): translation[src[i+k]] = nn[k] translation5[src[i+k]] = nn5[k] translation10[src[i+k]] = nn10[k] if args.geomm_embeddings_path is not None: src_id=src[i+k] src_word = src_words[src_id] all_trg_words = [ trg_words[trg_id] for trg_id in src2trg[src_id] ] trgout_words = [ trg_words[j] for j in translation10[src_id] ] #ss = list(nn10[src_id,:]) p1 = ':'.join(all_trg_words) p2 = ':'.join(trgout_words) #p2 = delim.join( [ '{}{}{}'.format(w,delim,s) for w,s in zip(trgout_words,ss) ] ) translations_file.write( '{s}{delim}{p1}{delim}{p2}\n'.format(s=src_word, p1=p1, p2=p2, delim=delim) ) if args.geomm_embeddings_path is not None: translations_file.close() accuracy = np.mean([1 if translation[i] in src2trg[i] else 0 for i in src]) mean=0 for i in src: for k in translation5[i]: if k in src2trg[i]: mean+=1 break mean/=len(src) accuracy5 = mean mean=0 for i in src: for k in translation10[i]: if k in src2trg[i]: mean+=1 break mean/=len(src) accuracy10 = mean message = src_input.split(".")[-2] + "-->" + trg_input.split(".")[-2] + ":" 'Coverage:{0:7.2%} Accuracy:{1:7.2%}'.format(coverage, accuracy)
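# ---------------------------------------------------------------------------
# Illustration (not part of the original scripts): applying a model saved via
# --model_path above. np.loadtxt mirrors the np.savetxt calls in the script,
# and the mapping is the same U * sqrt(B) transformation used to build xw and
# zw there.
# ---------------------------------------------------------------------------
import numpy as np
import scipy.linalg

def load_and_map(x, z, model_path):
    """Map source matrix x and target matrix z into the GeoMM latent space."""
    U1 = np.loadtxt('{}/U_src.csv'.format(model_path))
    U2 = np.loadtxt('{}/U_tgt.csv'.format(model_path))
    B = np.loadtxt('{}/B.csv'.format(model_path))
    B_sqrt = scipy.linalg.sqrtm(B)  # B is positive definite, so the root is well defined
    return x.dot(U1).dot(B_sqrt), z.dot(U2).dot(B_sqrt)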
def main(): # Parse command line arguments parser = argparse.ArgumentParser( description= 'Evaluate embeddings of two languages in a shared space in word translation induction' ) parser.add_argument('src_embeddings', help='the source language embeddings') parser.add_argument('trg_embeddings', help='the target language embeddings') parser.add_argument('-d', '--dictionary', default=sys.stdin.fileno(), help='the test dictionary file (defaults to stdin)') parser.add_argument( '--retrieval', default='nn', choices=['nn', 'invnn', 'invsoftmax', 'csls', 'fcsls'], help= 'the retrieval method (nn: standard nearest neighbor; invnn: inverted nearest neighbor; invsoftmax: inverted softmax; csls: cross-domain similarity local scaling)' ) parser.add_argument( '--inv_temperature', default=1, type=float, help='the inverse temperature (only compatible with inverted softmax)') parser.add_argument( '--inv_sample', default=None, type=int, help= 'use a random subset of the source vocabulary for the inverse computations (only compatible with inverted softmax)' ) parser.add_argument( '--neighborhood', default=10, type=int, help='the neighborhood size (only compatible with csls)') parser.add_argument('--nbest', default=3, type=int, help='number of candidates to get') parser.add_argument( '--dot', action='store_true', help= 'use the dot product in the similarity computations instead of the cosine' ) parser.add_argument('--verbose', action='store_true', help='verbose, print more information') parser.add_argument( '--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)') parser.add_argument('--seed', type=int, default=0, help='the random seed') parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32', help='the floating-point precision (defaults to fp32)') parser.add_argument('--vocabulary_cutoff', default=0, type=int, help='vocab limit for reading the embedding') parser.add_argument('--cuda', action='store_true', help='use cuda (requires cupy)') args = parser.parse_args() # Choose the right dtype for the desired precision if args.precision == 'fp16': dtype = 'float16' elif args.precision == 'fp32': dtype = 'float32' elif args.precision == 'fp64': dtype = 'float64' # Read input embeddings srcfile = open(args.src_embeddings, encoding=args.encoding, errors='surrogateescape') trgfile = open(args.trg_embeddings, encoding=args.encoding, errors='surrogateescape') src_words, x = embeddings.read(srcfile, dtype=dtype, threshold=args.vocabulary_cutoff) trg_words, z = embeddings.read(trgfile, dtype=dtype, threshold=args.vocabulary_cutoff) # NumPy/CuPy management if args.cuda: if not supports_cupy(): print('ERROR: Install CuPy for CUDA support', file=sys.stderr) sys.exit(-1) xp = get_cupy() x = xp.asarray(x) z = xp.asarray(z) else: xp = np xp.random.seed(args.seed) # Length normalize embeddings so their dot product effectively computes the cosine similarity if not args.dot: embeddings.length_normalize(x) embeddings.length_normalize(z) # Build word to index map src_word2ind = {word: i for i, word in enumerate(src_words)} trg_ind2word = {i: word for i, word in enumerate(trg_words)} src_ind2word = {i: word for i, word in enumerate(src_words)} # Read dictionary and compute coverage f = open(args.dictionary, encoding=args.encoding, errors='surrogateescape') oov = set() vocab = set() src = [] for line in f: if '\t' in line: w, _ = line.split('\t') elif ' ' in line: w, _ = line.split(' ') else: w = line.strip() if w in vocab: continue try: src.append(src_word2ind[w]) 
vocab.add(w) except KeyError: oov.add(w) if args.verbose: print(f'{len(oov)} oovs: ' + '|'.join(list(oov)[:10]), file=sys.stderr) if args.retrieval == 'nn': # Standard nearest neighbor queries = x[src] topvals, topinds = embeddings.faiss_knn(queries, z, k=args.nbest) for i, wind in enumerate(src): w = src_ind2word[wind] for k, tind in enumerate(topinds[i]): wt = trg_ind2word[tind] st = topvals[i, k] print(f'{w}\t{wt}\t{st:.3f}') elif args.retrieval == 'fcsls': # Cross-domain similarity local scaling sim_bwd, _ = embeddings.faiss_knn(z, x, k=args.neighborhood) knn_sim_bwd = sim_bwd.mean(axis=1) queries = x[src] topvals, topinds = embeddings.faiss_knn(queries, z, k=30) for i, wind in enumerate(src): w = src_ind2word[wind] for k, tind in enumerate(topinds[i]): wt = trg_ind2word[tind] st = 2 * topvals[i, k] - knn_sim_bwd[topinds[i, k]] print(f'{w}\t{wt}\t{st:.3f}') elif args.retrieval == 'csls': # Cross-domain similarity local scaling sim_bwd, _ = embeddings.faiss_knn(z, x, k=args.neighborhood) knn_sim_bwd = sim_bwd.mean(axis=1) queries = x[src] for i in range(0, len(src), BATCH_SIZE): j = min(i + BATCH_SIZE, len(src)) similarities = 2 * x[src[i:j]].dot( z.T) - knn_sim_bwd # Equivalent to the real CSLS scores for NN nn = (-similarities).argpartition(args.nbest, axis=1) for k in range(j - i): w = src_ind2word[src[i + k]] for tind in nn[k, :args.nbest]: wt = trg_ind2word[tind] st = similarities[k, tind] print(f'{w}\t{wt}\t{st:.3f}')
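# ---------------------------------------------------------------------------
# Illustration (not part of the original scripts): why the 'csls' branch above
# can drop the forward neighbourhood term and still be "equivalent to the real
# CSLS scores for NN". For a fixed source word, the full CSLS score
# 2*sim - r_x - r_z and the reduced score 2*sim - r_z differ only by the
# constant r_x, so the argmax over target words is unchanged. Toy check with
# made-up numbers:
# ---------------------------------------------------------------------------
import numpy as np

rng = np.random.RandomState(0)
sim = rng.rand(8)   # similarities of one source word to 8 target words
r_z = rng.rand(8)   # backward neighbourhood term, one value per target word
r_x = 0.3           # forward neighbourhood term, constant for this source word

full = 2 * sim - r_x - r_z
reduced = 2 * sim - r_z
assert full.argmax() == reduced.argmax()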
def main(): # Parse command line arguments parser = argparse.ArgumentParser( description='Evaluate embeddings in word analogy') parser.add_argument('embeddings', help='the word embeddings') parser.add_argument( '-t', '--threshold', type=int, default=0, help= 'reduce vocabulary of the model for fast approximate evaluation (0 = off, otherwise typical value is 30,000)' ) parser.add_argument('-i', '--input', default=sys.stdin.fileno(), help='the test file (defaults to stdin)') parser.add_argument('-v', '--verbose', action='store_true', help='verbose output (give category specific results)') parser.add_argument('-l', '--lowercase', action='store_true', help='lowercase the words in the test file') parser.add_argument( '--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)') parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32', help='the floating-point precision (defaults to fp32)') args = parser.parse_args() # Choose the right dtype for the desired precision if args.precision == 'fp16': dtype = 'float16' elif args.precision == 'fp32': dtype = 'float32' elif args.precision == 'fp64': dtype = 'float64' # Read input embeddings f = open(args.embeddings, encoding=args.encoding, errors='surrogateescape') words, matrix = embeddings.read(f, threshold=args.threshold, dtype=dtype) # Build word to index map word2ind = {word: i for i, word in enumerate(words)} # Length normalize embeddings embeddings.length_normalize(matrix) # Parse test file f = open(args.input, encoding=args.encoding, errors='surrogateescape') categories = [] src1 = [] trg1 = [] src2 = [] trg2 = [] for line in f: if line.startswith(': '): name = line[2:-1] is_syntactic = name.startswith('gram') categories.append({ 'name': name, 'is_syntactic': is_syntactic, 'total': 0, 'oov': 0 }) else: try: ind = [ word2ind[word.lower() if args.lowercase else word] for word in line.split() ] src1.append(ind[0]) trg1.append(ind[1]) src2.append(ind[2]) trg2.append(ind[3]) categories[-1]['total'] += 1 except KeyError: categories[-1]['oov'] += 1 total = len(src1) # Compute nearest neighbors using efficient matrix multiplication nn = [] for i in range(0, total, BATCH_SIZE): j = min(i + BATCH_SIZE, total) similarities = (matrix[src2[i:j]] - matrix[src1[i:j]] + matrix[trg1[i:j]]).dot(matrix.T) similarities[range(j - i), src1[i:j]] = -1 similarities[range(j - i), trg1[i:j]] = -1 similarities[range(j - i), src2[i:j]] = -1 nn += np.argmax(similarities, axis=1).tolist() nn = np.array(nn) # Compute and print accuracies semantic = {'correct': 0, 'total': 0, 'oov': 0} syntactic = {'correct': 0, 'total': 0, 'oov': 0} ind = 0 for category in categories: current = syntactic if category['is_syntactic'] else semantic correct = np.sum(nn[ind:ind + category['total']] == trg2[ind:ind + category['total']]) current['correct'] += correct current['total'] += category['total'] current['oov'] += category['oov'] ind += category['total'] if args.verbose: print('Coverage:{0:7.2%} Accuracy:{1:7.2%} | {2}'.format( category['total'] / (category['total'] + category['oov']), correct / category['total'], category['name'])) if args.verbose: print('-' * 80) print('Coverage:{0:7.2%} Accuracy:{1:7.2%} (sem:{2:7.2%}, syn:{3:7.2%})'. 
    print('Coverage:{0:7.2%} Accuracy:{1:7.2%} (sem:{2:7.2%}, syn:{3:7.2%})'.format(
        (semantic['total'] + syntactic['total'])
        / (semantic['total'] + syntactic['total'] + semantic['oov'] + syntactic['oov']),
        (semantic['correct'] + syntactic['correct']) / (semantic['total'] + syntactic['total']),
        semantic['correct'] / semantic['total'],
        syntactic['correct'] / syntactic['total']))
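
# Both analogy evaluators expect the Google word-analogy format: a ': <category>' header line
# (categories whose name starts with 'gram' are counted as syntactic), followed by lines of four
# words each, read as "a : b :: c : d".  A hypothetical miniature test file:
#
#   : capital-common-countries
#   athens greece baghdad iraq
#   : gram3-comparative
#   bad worse big bigger
#
# Minimal sketch of that parsing convention (illustrative only, not the repository's own code):
def parse_analogy_file(path):
    categories = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            if line.startswith(': '):
                name = line[2:-1]                        # strip the ': ' prefix and trailing newline
                categories.append((name, name.startswith('gram'), []))
            else:
                categories[-1][2].append(line.split())   # [a, b, c, d]
    return categories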
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Map multiple embeddings into a shared latent space (GeoMM Multi)')
    parser.add_argument('emb_file', help='file listing the input embedding files, one path per line')
    parser.add_argument('--encoding', default='utf-8',
                        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--max_vocab', default=0, type=int,
                        help='Maximum vocabulary to be loaded, 0 allows complete vocabulary')
    parser.add_argument('--verbose', default=0, type=int, help='Verbose')

    mapping_group = parser.add_argument_group('mapping arguments', 'Basic embedding mapping arguments')
    mapping_group.add_argument('-dtrain_file', '--dictionary_train_file', default=sys.stdin.fileno(),
                               help='the training dictionary file (defaults to stdin)')
    mapping_group.add_argument('-dtest_file', '--dictionary_test_file', default=sys.stdin.fileno(),
                               help='the test dictionary file (defaults to stdin)')
    mapping_group.add_argument('--normalize', choices=['unit', 'center', 'unitdim', 'centeremb'],
                               nargs='*', default=[],
                               help='the normalization actions to perform in order')

    geomm_group = parser.add_argument_group('GeoMM Multi arguments', 'Arguments for GeoMM Multi method')
    geomm_group.add_argument('--l2_reg', type=float, default=1e3, help='Lambda for L2 Regularization')
    geomm_group.add_argument('--max_opt_time', type=int, default=5000,
                             help='Maximum time limit for optimization in seconds')
    geomm_group.add_argument('--max_opt_iter', type=int, default=150,
                             help='Maximum number of iterations for optimization')

    eval_group = parser.add_argument_group('evaluation arguments', 'Arguments for evaluation')
    eval_group.add_argument('--normalize_eval', action='store_true',
                            help='Normalize the embeddings at test time')
    eval_group.add_argument('--eval_batch_size', type=int, default=1000, help='Batch size for evaluation')
    eval_group.add_argument('--csls_neighbourhood', type=int, default=10, help='Neighbourhood size for CSLS')

    args = parser.parse_args()
    BATCH_SIZE = args.eval_batch_size

    # Logging: tee stdout to a per-run log file
    method_name = os.path.join('logs', 'geomm_multi')
    directory = os.path.join(os.path.join(os.getcwd(), method_name),
                             datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    if not os.path.exists(directory):
        os.makedirs(directory)
    log_file_name, file_extension = os.path.splitext(os.path.basename(args.dictionary_train_file))
    log_file_name = log_file_name + '.log'

    class Logger(object):
        def __init__(self):
            self.terminal = sys.stdout
            self.log = open(os.path.join(directory, log_file_name), "a")

        def write(self, message):
            self.terminal.write(message)
            self.log.write(message)

        def flush(self):
            # This flush method is needed for Python 3 compatibility; it handles
            # the flush command by doing nothing (extra behavior could go here).
            pass

    sys.stdout = Logger()

    if args.verbose:
        print('Current arguments: {0}'.format(args))

    dtype = 'float32'
    if args.verbose:
        print('Loading train data...')

    # Read input embeddings (emb_file lists one embedding file per line)
    words = []
    emb = []
    with open(args.emb_file, encoding=args.encoding, errors='surrogateescape') as f:
        for line in f:
            srcfile = open(line.strip(), encoding=args.encoding, errors='surrogateescape')
            words_temp, x_temp = embeddings.read(srcfile, max_voc=args.max_vocab, dtype=dtype)
            words.append(words_temp)
            emb.append(x_temp)

    # Build word to index maps
    word2ind = []
    for lang in words:
        word2ind.append({word: i for i, word in enumerate(lang)})

    # Build training dictionaries (each line: <src lang index>,<trg lang index>,<dictionary path>)
    train_pairs = []
    with open(args.dictionary_train_file, encoding=args.encoding, errors='surrogateescape') as ff:
        for line in ff:
            vals = line.split(',')
            curr_dict = [int(vals[0].strip()), int(vals[1].strip())]
            src_indices = []
            trg_indices = []
            with open(vals[2].strip(), encoding=args.encoding, errors='surrogateescape') as f:
                for line in f:
                    src, trg = line.split()
                    if args.max_vocab:
                        src = src.lower()
                        trg = trg.lower()
                    try:
                        src_ind = word2ind[curr_dict[0]][src]
                        trg_ind = word2ind[curr_dict[1]][trg]
                        src_indices.append(src_ind)
                        trg_indices.append(trg_ind)
                    except KeyError:
                        if args.verbose:
                            print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg),
                                  file=sys.stderr)
            curr_dict.append(src_indices)
            curr_dict.append(trg_indices)
            train_pairs.append(curr_dict)

    if args.verbose:
        print('Normalizing embeddings...')

    # Step 0: Normalization
    for action in args.normalize:
        if action == 'unit':
            for i in range(len(emb)):
                emb[i] = embeddings.length_normalize(emb[i])
        elif action == 'center':
            for i in range(len(emb)):
                emb[i] = embeddings.mean_center(emb[i])
        elif action == 'unitdim':
            for i in range(len(emb)):
                emb[i] = embeddings.length_normalize_dimensionwise(emb[i])
        elif action == 'centeremb':
            for i in range(len(emb)):
                emb[i] = embeddings.mean_center_embeddingwise(emb[i])

    # Step 1: Optimization
    if args.verbose:
        print('Beginning Optimization')
    start_time = time.time()
    mean_size = 0
    for tp in range(len(train_pairs)):
        src_indices = train_pairs[tp][2]
        trg_indices = train_pairs[tp][3]
        x_count = len(set(src_indices))
        z_count = len(set(trg_indices))
        A = np.zeros((x_count, z_count))

        # Creating dictionary matrix from training set
        map_dict_src = {}
        map_dict_trg = {}
        I = 0
        uniq_src = []
        uniq_trg = []
        for i in range(len(src_indices)):
            if src_indices[i] not in map_dict_src.keys():
                map_dict_src[src_indices[i]] = I
                I += 1
                uniq_src.append(src_indices[i])
        J = 0
        for j in range(len(trg_indices)):
            if trg_indices[j] not in map_dict_trg.keys():
                map_dict_trg[trg_indices[j]] = J
                J += 1
                uniq_trg.append(trg_indices[j])
        for i in range(len(src_indices)):
            A[map_dict_src[src_indices[i]], map_dict_trg[trg_indices[i]]] = 1
        train_pairs[tp].append(uniq_src)
        train_pairs[tp].append(uniq_trg)
        train_pairs[tp].append(A)
        mean_size += (len(uniq_src) * len(uniq_trg))
    mean_size = mean_size / len(train_pairs)

    np.random.seed(0)
    Lambda = args.l2_reg
    variables = []
    manif = []
    low_rank = emb[0].shape[1]
    for i in range(len(emb)):
        variables.append(TT.matrix())
        manif.append(Stiefel(emb[i].shape[1], low_rank))
    variables.append(TT.matrix())
    manif.append(PositiveDefinite(low_rank))
    B = variables[-1]

    cost = 0.5 * Lambda * (TT.sum(B ** 2))
    for i in range(len(train_pairs)):
        x = emb[train_pairs[i][0]]
        z = emb[train_pairs[i][1]]
        U1 = variables[train_pairs[i][0]]
        U2 = variables[train_pairs[i][1]]
        cost = cost + TT.sum(
            ((shared(x[train_pairs[i][4]]).dot(U1.dot(B.dot(U2.T)))).dot(shared(z[train_pairs[i][5]]).T)
             - shared(train_pairs[i][6])) ** 2) / float(len(train_pairs[i][2]))
    solver = ConjugateGradient(maxtime=args.max_opt_time, maxiter=args.max_opt_iter, mingradnorm=1e-12)
    manifold = Product(manif)
    problem = Problem(manifold=manifold, cost=cost, arg=variables, verbosity=3)
    wopt = solver.solve(problem)

    w = wopt
    U1 = w[0]
    U2 = w[1]
    B = w[2]

    # Step 2: Transformation
    Bhalf = scipy.linalg.sqrtm(wopt[-1])
    test_emb = []
    for i in range(len(emb)):
        test_emb.append(emb[i].dot(wopt[i]).dot(Bhalf))

    end_time = time.time()
    if args.verbose:
        print('Completed training in {0:.2f} seconds'.format(end_time - start_time))
    gc.collect()

    # Step 3: Evaluation
    if args.verbose:
        print('Beginning Evaluation')
    if args.normalize_eval:
        for i in range(len(test_emb)):
            test_emb[i] = embeddings.length_normalize(test_emb[i])

    # Loading test dictionaries (each line: <src lang index>,<trg lang index>,<dictionary path>)
    with open(args.dictionary_test_file, encoding=args.encoding, errors='surrogateescape') as ff:
        for line in ff:
            vals = line.split(',')
            curr_dict = [int(vals[0].strip()), int(vals[1].strip())]
            with open(vals[2].strip(), encoding=args.encoding, errors='surrogateescape') as f:
                src_word2ind = word2ind[curr_dict[0]]
                trg_word2ind = word2ind[curr_dict[1]]
                xw = test_emb[curr_dict[0]]
                zw = test_emb[curr_dict[1]]
                src2trg = collections.defaultdict(set)
                trg2src = collections.defaultdict(set)
                oov = set()
                vocab = set()
                for line in f:
                    src, trg = line.split()
                    if args.max_vocab:
                        src = src.lower()
                        trg = trg.lower()
                    try:
                        src_ind = src_word2ind[src]
                        trg_ind = trg_word2ind[trg]
                        src2trg[src_ind].add(trg_ind)
                        trg2src[trg_ind].add(src_ind)
                        vocab.add(src)
                    except KeyError:
                        oov.add(src)
                src = list(src2trg.keys())
                trgt = list(trg2src.keys())
                # If one of the translation options is in the vocabulary, then the entry is not an oov
                oov -= vocab
                coverage = len(src2trg) / (len(src2trg) + len(oov))
                f.close()

            translation = collections.defaultdict(int)
            translation5 = collections.defaultdict(list)
            translation10 = collections.defaultdict(list)

            # CSLS neighborhood terms (target-side means computed on the GPU with cupy)
            t = time.time()
            nbrhood_x = np.zeros(xw.shape[0])
            nbrhood_z = np.zeros(zw.shape[0])
            nbrhood_z2 = cp.zeros(zw.shape[0])
            for i in range(0, len(src), BATCH_SIZE):
                j = min(i + BATCH_SIZE, len(src))
                similarities = xw[src[i:j]].dot(zw.T)
                similarities_x = -1 * np.partition(-1 * similarities, args.csls_neighbourhood - 1, axis=1)
                nbrhood_x[src[i:j]] = np.mean(similarities_x[:, :args.csls_neighbourhood], axis=1)
            batch_num = 1
            for i in range(0, zw.shape[0], BATCH_SIZE):
                j = min(i + BATCH_SIZE, zw.shape[0])
                similarities = -1 * cp.partition(
                    -1 * cp.dot(cp.asarray(zw[i:j]), cp.transpose(cp.asarray(xw))),
                    args.csls_neighbourhood - 1, axis=1)[:, :args.csls_neighbourhood]
                nbrhood_z2[i:j] = (cp.mean(similarities[:, :args.csls_neighbourhood], axis=1))
                batch_num += 1
            nbrhood_z = cp.asnumpy(nbrhood_z2)

            # CSLS retrieval: top-1, top-5 and top-10 candidates for every test source word
            for i in range(0, len(src), BATCH_SIZE):
                j = min(i + BATCH_SIZE, len(src))
                similarities = xw[src[i:j]].dot(zw.T)
                similarities = np.transpose(np.transpose(2 * similarities) - nbrhood_x[src[i:j]]) - nbrhood_z
                nn = similarities.argmax(axis=1).tolist()
                similarities = np.argsort(similarities, axis=1)
                nn5 = similarities[:, -5:]
                nn10 = similarities[:, -10:]
                for k in range(j - i):
                    translation[src[i + k]] = nn[k]
                    translation5[src[i + k]] = nn5[k]
                    translation10[src[i + k]] = nn10[k]

            accuracy = np.mean([1 if translation[i] in src2trg[i] else 0 for i in src])
            mean = 0
            for i in src:
                for k in translation5[i]:
                    if k in src2trg[i]:
                        mean += 1
                        break
            mean /= len(src)
            accuracy5 = mean
            mean = 0
            for i in src:
                for k in translation10[i]:
                    if k in src2trg[i]:
                        mean += 1
                        break
            mean /= len(src)
            accuracy10 = mean
            print('Coverage:{0:7.2%} Accuracy:{1:7.2%} Accuracy(Top 5):{2:7.2%} Accuracy(Top 10):{3:7.2%}'.format(
                coverage, accuracy, accuracy5, accuracy10))
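
# The Riemannian optimization above minimizes, over orthonormal U_i (Stiefel manifold) and a
# positive-definite B,
#     0.5 * lambda * ||B||_F^2  +  sum over pairs (s, t) of ||X_s U_s B U_t^T Z_t^T - A_st||_F^2 / |D_st|
# where A_st is the 0/1 dictionary matrix built above; afterwards each embedding space i is mapped
# into the shared latent space as X_i U_i B^(1/2).  A NumPy sketch of the same objective for a
# single language pair (illustrative only; the shapes and names below are made up):
import numpy as np
import scipy.linalg

def geomm_pair_cost(X, Z, A, U1, U2, B, lam, dict_size):
    residual = X.dot(U1).dot(B).dot(U2.T).dot(Z.T) - A
    return 0.5 * lam * np.sum(B ** 2) + np.sum(residual ** 2) / float(dict_size)

def to_latent_space(X, U, B):
    return X.dot(U).dot(scipy.linalg.sqrtm(B))   # same transformation as Step 2 above

# Toy shapes: 6 source words and 5 target words in a 4-dimensional space, rank-4 latent space.
rng = np.random.default_rng(0)
X, Z = rng.normal(size=(6, 4)), rng.normal(size=(5, 4))
A = np.zeros((6, 5)); A[0, 0] = A[1, 2] = 1.0            # toy 0/1 dictionary matrix
U1, _ = np.linalg.qr(rng.normal(size=(4, 4)))            # orthonormal, like a point on the Stiefel manifold
U2, _ = np.linalg.qr(rng.normal(size=(4, 4)))
B = np.eye(4)                                            # positive definite
print(geomm_pair_cost(X, Z, A, U1, U2, B, lam=1e3, dict_size=2))
print(to_latent_space(X, U1, B).shape)                   # (6, 4)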
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Evaluate embeddings of two languages in a shared space in word translation induction')
    parser.add_argument('src_embeddings', help='the source language embeddings')
    parser.add_argument('trg_embeddings', help='the target language embeddings')
    parser.add_argument('-d', '--dictionary', default=sys.stdin.fileno(),
                        help='the test dictionary file (defaults to stdin)')
    parser.add_argument('--encoding', default='utf-8',
                        help='the character encoding for input/output (defaults to utf-8)')
    args = parser.parse_args()

    # Read input embeddings
    srcfile = open(args.src_embeddings, encoding=args.encoding, errors='surrogateescape')
    trgfile = open(args.trg_embeddings, encoding=args.encoding, errors='surrogateescape')
    src_words, src_matrix = embeddings.read(srcfile)
    trg_words, trg_matrix = embeddings.read(trgfile)

    # Length normalize embeddings so their dot product effectively computes the cosine similarity
    src_matrix = embeddings.length_normalize(src_matrix)
    trg_matrix = embeddings.length_normalize(trg_matrix)

    # Build word to index maps
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Read dictionary and compute coverage
    f = open(args.dictionary, encoding=args.encoding, errors='surrogateescape')
    src2trg = collections.defaultdict(set)
    oov = set()
    vocab = set()
    for line in f:
        src, trg = line.split()
        try:
            src_ind = src_word2ind[src]
            trg_ind = trg_word2ind[trg]
            src2trg[src_ind].add(trg_ind)
            vocab.add(src)
        except KeyError:
            oov.add(src)
    oov -= vocab  # if one of the translation options is in the vocabulary, the entry is not an OOV
    coverage = len(src2trg) / (len(src2trg) + len(oov))

    # Compute accuracy
    correct = 0
    src, trg = zip(*src2trg.items())
    for i in range(0, len(src2trg), BATCH_SIZE):
        j = min(i + BATCH_SIZE, len(src2trg))
        similarities = src_matrix[list(src[i:j])].dot(trg_matrix.T)
        nn = np.argmax(similarities, axis=1).tolist()
        for k in range(j - i):
            if nn[k] in trg[i + k]:
                correct += 1
    print('Coverage:{0:7.2%} Accuracy:{1:7.2%}'.format(coverage, correct / len(src2trg)))
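
# The test dictionary read above is plain text with one whitespace-separated "src trg" pair per
# line; a source word may appear on several lines, and any of its listed targets counts as a
# correct translation.  A small self-contained sketch of that convention (illustrative only,
# with made-up word pairs):
import collections
import numpy as np

gold = collections.defaultdict(set)
for pair in ['dog hund', 'dog hunden', 'cat katze']:     # hypothetical dictionary lines
    s, t = pair.split()
    gold[s].add(t)

predictions = {'dog': 'hund', 'cat': 'kater'}
accuracy = np.mean([1 if predictions[s] in gold[s] else 0 for s in gold])
print('Accuracy: {0:.2%}'.format(accuracy))              # 50.00%: 'kater' is not a listed target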
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Evaluate embeddings in word analogy')
    parser.add_argument('--src_embeddings', help='the word embeddings for source (left side)')
    parser.add_argument('--trg_embeddings', help='the word embeddings for target (right side)')
    parser.add_argument('-t', '--threshold', type=int, default=0,
                        help='reduce vocabulary of the model for fast approximate evaluation '
                             '(0 = off, otherwise typical value is 30,000)')
    parser.add_argument('-i', '--input', default=sys.stdin.fileno(),
                        help='the test file (defaults to stdin)')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='verbose output (give category specific results)')
    parser.add_argument('-l1', '--src_lowercase', action='store_true',
                        help='lowercase the source-side words in the test file')
    parser.add_argument('-l2', '--trg_lowercase', action='store_true',
                        help='lowercase the target-side words in the test file')
    parser.add_argument('--encoding', default='utf-8',
                        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32',
                        help='the floating-point precision (defaults to fp32)')
    args = parser.parse_args()

    # Choose the right dtype for the desired precision
    if args.precision == 'fp16':
        dtype = 'float16'
    elif args.precision == 'fp32':
        dtype = 'float32'
    elif args.precision == 'fp64':
        dtype = 'float64'

    # Read input embeddings
    f = open(args.src_embeddings, encoding=args.encoding, errors='surrogateescape')
    src_words, src_matrix = embeddings.read(f, threshold=args.threshold, dtype=dtype)
    f.close()
    f = open(args.trg_embeddings, encoding=args.encoding, errors='surrogateescape')
    trg_words, trg_matrix = embeddings.read(f, threshold=args.threshold, dtype=dtype)
    f.close()

    # Build word to index maps
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}
    src_ind2word = {i: word for i, word in enumerate(src_words)}
    trg_ind2word = {i: word for i, word in enumerate(trg_words)}

    # Length normalize embeddings
    embeddings.length_normalize(src_matrix)
    embeddings.length_normalize(trg_matrix)

    # Parse test file (a : b :: c : d, i.e. c - a + b ~ d)
    f = open(args.input, encoding=args.encoding, errors='surrogateescape')
    categories = []
    a = []  # src lang
    b = []  # src lang
    c = []  # trg lang
    d = []  # trg lang
    linecounter = 0
    for line in f:
        if line.startswith(': '):
            name = line[2:-1]
            is_syntactic = name.startswith('gram')
            categories.append({'name': name, 'is_syntactic': is_syntactic, 'total': 0, 'oov': 0})
        else:
            try:
                words = line.split()
                w0 = src_word2ind[words[0].lower() if args.src_lowercase else words[0]]
                w1 = src_word2ind[words[1].lower() if args.src_lowercase else words[1]]
                w2 = trg_word2ind[words[2].lower() if args.trg_lowercase else words[2]]
                w3 = trg_word2ind[words[3].lower() if args.trg_lowercase else words[3]]
                a.append(w0)
                b.append(w1)
                c.append(w2)
                d.append(w3)
                categories[-1]['total'] += 1
            except KeyError:
                categories[-1]['oov'] += 1
    total = len(a)

    # Compute nearest neighbors using efficient matrix multiplication
    nn = []
    for i in range(0, total, BATCH_SIZE):
        j = min(i + BATCH_SIZE, total)
        similarities = (trg_matrix[c[i:j]] - src_matrix[a[i:j]] + src_matrix[b[i:j]]).dot(trg_matrix.T)
        # Mask out the query words, mirroring the monolingual analogy evaluation
        similarities[range(j - i), a[i:j]] = -1
        similarities[range(j - i), b[i:j]] = -1
        similarities[range(j - i), c[i:j]] = -1
        nn += np.argmax(similarities, axis=1).tolist()
    nn = np.array(nn)

    # Compute and print accuracies
    semantic = {'correct': 0, 'total': 0, 'oov': 0}
    syntactic = {'correct': 0, 'total': 0, 'oov': 0}
    ind = 0
    with open('crosslingual_predict.txt', 'w') as outfile:
        for i in range(len(nn)):
            outfile.write(src_ind2word[a[i]] + ' ' + src_ind2word[b[i]] + ' ' + trg_ind2word[c[i]] + ' '
                          + trg_ind2word[d[i]] + ' | ' + trg_ind2word[nn[i]] + '\n')
    for category in categories:
        current = syntactic if category['is_syntactic'] else semantic
        correct = np.sum(nn[ind:ind + category['total']] == d[ind:ind + category['total']])
        current['correct'] += correct
        current['total'] += category['total']
        current['oov'] += category['oov']
        ind += category['total']
        if args.verbose:
            print('Coverage:{0:7.2%} Accuracy:{1:7.2%} | {2}'.format(
                category['total'] / (category['total'] + category['oov']),
                correct / category['total'], category['name']))
    if args.verbose:
        print('-' * 80)
    print('Coverage:{0:7.2%} Accuracy:{1:7.2%} (sem:{2:7.2%}, syn:{3:7.2%})'.format(
        (semantic['total'] + syntactic['total'])
        / (semantic['total'] + syntactic['total'] + semantic['oov'] + syntactic['oov']),
        (semantic['correct'] + syntactic['correct']) / (semantic['total'] + syntactic['total']),
        semantic['correct'] / semantic['total'],
        syntactic['correct'] / syntactic['total']))
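
# The cross-lingual analogy evaluation above predicts d in "a : b :: c : d" by ranking all target
# words with the offset vector c - a + b (a and b are indexed in the source space, c and d in the
# target space).  A compact, illustrative NumPy sketch of one batch of predictions (not the
# repository's code; it only masks the c column here, while the script above also masks the a and
# b columns, mirroring its monolingual counterpart):
import numpy as np

def predict_crosslingual_analogies(src_mat, trg_mat, a, b, c):
    # src_mat, trg_mat: length-normalized embedding matrices; a, b, c: integer index arrays
    sims = (trg_mat[c] - src_mat[a] + src_mat[b]).dot(trg_mat.T)
    sims[np.arange(len(c)), c] = -1      # never return the c query word itself
    return sims.argmax(axis=1)           # predicted target index of d for each analogy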