@site: http://muyeby.github.io @software: PyCharm @file: GetTrainVec.py.py @time: 17-12-18 上午10:11 """ import numpy as np import embeddings import sys from sklearn import preprocessing if __name__ == "__main__": source_file = open(sys.argv[1], encoding='utf-8', errors='surrogateescape') target_file = open(sys.argv[2], encoding='utf-8', errors='surrogateescape') en_words, en_vec = embeddings.read(source_file) de_words, de_vec = embeddings.read(target_file) src_word2ind = {word: i for i, word in enumerate(en_words)} trg_word2ind = {word: i for i, word in enumerate(de_words)} src_indices = [] trg_indices = [] src_words = [] trg_words = [] f = open(sys.argv[3], encoding='utf-8', errors='surrogateescape') for line in f: src, trg = line.split() try: src_words.append(src)
def evaluate(src_emb_fname, tgt_emb_fname, dict_fname, max_voc=0, retrieval_method="csls", csls_k=10, batch_size=2500):
    """Print word-translation accuracy (top 1/5/10) for two embedding files.

    Args:
        src_emb_fname: path of the source-language embeddings file.
        tgt_emb_fname: path of the target-language embeddings file.
        dict_fname: path of the test dictionary, one whitespace-separated
            ``source target`` pair per line.
        max_voc: forwarded to ``embeddings.read`` as ``max_voc``.  When it is
            non-zero the dictionary words are also lower-cased before lookup.
        retrieval_method: ``'nn'`` (plain nearest neighbour) or ``'csls'``.
            Any other value leaves the translation tables empty.
        csls_k: neighbourhood size used by the CSLS retrieval.
        batch_size: number of rows scored per matrix product.

    Returns:
        None.  Coverage and accuracies are printed to stdout.

    Raises:
        ZeroDivisionError: if no dictionary entry is in vocabulary.
    """
    print('Loading train data...')
    with open(src_emb_fname, 'r', encoding='utf-8', errors='surrogateescape') as srcfile, \
            open(tgt_emb_fname, 'r', encoding='utf-8', errors='surrogateescape') as tgtfile:
        # Read source embeddings
        src_words, x = embeddings.read(srcfile, max_voc=max_voc, dtype='float32')
        # Read target embeddings
        tgt_words, z = embeddings.read(tgtfile, max_voc=max_voc, dtype='float32')
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    tgt_word2ind = {word: i for i, word in enumerate(tgt_words)}

    xw = embeddings.length_normalize(x)
    zw = embeddings.length_normalize(z)

    # Loading test dictionary
    src2trg = collections.defaultdict(set)
    oov = set()
    vocab = set()
    with open(dict_fname, encoding='utf-8', errors='surrogateescape') as f:
        for line in f:
            src_word, trg_word = line.split()
            # The function's own parameter decides lower-casing; the previous
            # code read an `args` object that is not defined in this scope.
            if max_voc:
                src_word = src_word.lower()
                trg_word = trg_word.lower()
            try:
                src_ind = src_word2ind[src_word]
                trg_ind = tgt_word2ind[trg_word]
            except KeyError:
                oov.add(src_word)
            else:
                src2trg[src_ind].add(trg_ind)
                vocab.add(src_word)
    src = list(src2trg.keys())
    oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
    coverage = len(src2trg) / (len(src2trg) + len(oov))

    ### get translations
    translation = collections.defaultdict(int)
    translation5 = collections.defaultdict(list)
    translation10 = collections.defaultdict(list)

    if retrieval_method == 'nn':  # Standard nearest neighbor
        for i in range(0, len(src), batch_size):
            j = min(i + batch_size, len(src))
            similarities = xw[src[i:j]].dot(zw.T)
            nn = similarities.argmax(axis=1).tolist()
            similarities_idx = similarities.argsort(axis=1)
            nn5 = similarities_idx[:, -5:]
            nn10 = similarities_idx[:, -10:]
            for k in range(j - i):
                translation[src[i + k]] = nn[k]
                translation5[src[i + k]] = nn5[k]
                translation10[src[i + k]] = nn10[k]

    elif retrieval_method == 'csls':
        t = time.time()
        nbrhood_x = np.zeros(xw.shape[0])
        nbrhood_z2 = cp.zeros(zw.shape[0])
        print('Computing X Neighbourhood')
        # Mean similarity of each test source word to its csls_k most similar
        # target words (partition on the negated scores puts the k largest first).
        for i in range(0, len(src), batch_size):
            j = min(i + batch_size, len(src))
            similarities = xw[src[i:j]].dot(zw.T)
            similarities_x = -1 * np.partition(-1 * similarities, csls_k - 1, axis=1)
            nbrhood_x[src[i:j]] = np.mean(similarities_x[:, :csls_k], axis=1)
        print('Completed in {0} seconds'.format(time.time() - t))
        print('Computing Z Neighbourhood')
        # Same quantity for every target word against all source words,
        # computed through CuPy.
        batch_num = 1
        for i in range(0, zw.shape[0], batch_size):
            j = min(i + batch_size, zw.shape[0])
            similarities = -1 * cp.partition(
                -1 * cp.dot(cp.asarray(zw[i:j]), cp.transpose(cp.asarray(xw))),
                csls_k - 1, axis=1)[:, :csls_k]
            nbrhood_z2[i:j] = cp.mean(similarities[:, :csls_k], axis=1)
            print('Completed batch {0} in {1}'.format(batch_num, time.time() - t))
            batch_num += 1
        nbrhood_z = cp.asnumpy(nbrhood_z2)
        print(time.time() - t)
        csls_alpha = 1
        for i in range(0, len(src), batch_size):
            j = min(i + batch_size, len(src))
            similarities = xw[src[i:j]].dot(zw.T)
            # 2*sim(x, z) - alpha*r(x) - alpha*r(z); the double transpose
            # broadcasts the per-row source term.
            similarities = np.transpose(
                np.transpose(2 * similarities) - csls_alpha * nbrhood_x[src[i:j]]
            ) - csls_alpha * nbrhood_z
            nn = similarities.argmax(axis=1).tolist()
            print(time.time() - t)
            similarities = np.argsort(similarities, axis=1)
            nn5 = similarities[:, -5:]
            nn10 = similarities[:, -10:]
            for k in range(j - i):
                translation[src[i + k]] = nn[k]
                translation5[src[i + k]] = nn5[k]
                translation10[src[i + k]] = nn10[k]
        print('Completed in {0} seconds'.format(time.time() - t))

    # evaluation metrics
    accuracy = np.mean([1 if translation[i] in src2trg[i] else 0 for i in src])
    accuracy5 = sum(
        1 for i in src if any(k in src2trg[i] for k in translation5[i])) / len(src)
    accuracy10 = sum(
        1 for i in src if any(k in src2trg[i] for k in translation10[i])) / len(src)
    print(
        'Coverage:{0:7.2%}  Accuracy:{1:7.2%}  Accuracy(Top 5):{2:7.2%}  Accuracy(Top 10):{3:7.2%}'
        .format(coverage, accuracy, accuracy5, accuracy10))
def main():
    """Evaluate two embedding files on word translation induction.

    Reads the source/target embeddings and a test dictionary named on the
    command line, retrieves one or more target candidates per source word
    with the selected retrieval method, and prints coverage and accuracy.
    An entry counts as correct when any retrieved candidate is among the
    gold translations of the source word.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Evaluate embeddings of two languages in a shared space in word translation induction')
    parser.add_argument('src_embeddings', help='the source language embeddings')
    parser.add_argument('trg_embeddings', help='the target language embeddings')
    parser.add_argument('-d', '--dictionary', default=sys.stdin.fileno(), help='the test dictionary file (defaults to stdin)')
    parser.add_argument('--retrieval', default='nn', choices=['nn', 'topk', 'invnn', 'invsoftmax', 'csls'], help='the retrieval method (nn: standard nearest neighbor; invnn: inverted nearest neighbor; invsoftmax: inverted softmax; csls: cross-domain similarity local scaling)')
    parser.add_argument('--inv_temperature', default=1, type=float, help='the inverse temperature (only compatible with inverted softmax)')
    parser.add_argument('--inv_sample', default=None, type=int, help='use a random subset of the source vocabulary for the inverse computations (only compatible with inverted softmax)')
    parser.add_argument('-k', '--neighborhood', default=10, type=int, help='the neighborhood size (only compatible with csls)')
    parser.add_argument('--dot', action='store_true', help='use the dot product in the similarity computations instead of the cosine')
    parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--seed', type=int, default=0, help='the random seed')
    parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32', help='the floating-point precision (defaults to fp32)')
    parser.add_argument('--cuda', action='store_true', help='use cuda (requires cupy)')
    args = parser.parse_args()

    # Choose the right dtype for the desired precision (argparse `choices`
    # guarantees the key is present).
    dtype = {'fp16': 'float16', 'fp32': 'float32', 'fp64': 'float64'}[args.precision]

    # Number of candidates kept per source word by the 'topk' retrieval.
    knn = args.neighborhood

    # Read input embeddings
    with open(args.src_embeddings, encoding=args.encoding, errors='surrogateescape') as srcfile, \
            open(args.trg_embeddings, encoding=args.encoding, errors='surrogateescape') as trgfile:
        src_words, x = embeddings.read(srcfile, dtype=dtype)
        trg_words, z = embeddings.read(trgfile, dtype=dtype)

    # NumPy/CuPy management
    if args.cuda:
        if not supports_cupy():
            print('ERROR: Install CuPy for CUDA support', file=sys.stderr)
            sys.exit(-1)
        xp = get_cupy()
        x = xp.asarray(x)
        z = xp.asarray(z)
    else:
        xp = np
    xp.random.seed(args.seed)

    # Length normalize embeddings so their dot product effectively computes the cosine similarity
    # NOTE(review): the return value is discarded, so this relies on
    # embeddings.length_normalize working in place -- confirm.
    if not args.dot:
        embeddings.length_normalize(x)
        embeddings.length_normalize(z)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Read dictionary and compute coverage
    src2trg = collections.defaultdict(set)
    oov = set()
    vocab = set()
    with open(args.dictionary, encoding=args.encoding, errors='surrogateescape') as f:
        for line in f:
            src_word, trg_word = line.split()
            try:
                src_ind = src_word2ind[src_word]
                trg_ind = trg_word2ind[trg_word]
            except KeyError:
                oov.add(src_word)
            else:
                src2trg[src_ind].add(trg_ind)
                vocab.add(src_word)
    src = list(src2trg.keys())
    oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
    coverage = len(src2trg) / (len(src2trg) + len(oov))

    # Find translations: source index -> list of candidate target indices
    translation = collections.defaultdict(list)
    if args.retrieval == 'nn':  # Standard nearest neighbor
        for i in range(0, len(src), BATCH_SIZE):
            j = min(i + BATCH_SIZE, len(src))
            similarities = x[src[i:j]].dot(z.T)
            nn = similarities.argmax(axis=1).tolist()
            for k in range(j - i):
                translation[src[i + k]].append(nn[k])
    elif args.retrieval == 'invnn':  # Inverted nearest neighbor
        best_rank = np.full(len(src), x.shape[0], dtype=int)
        best_sim = np.full(len(src), -100, dtype=dtype)
        for i in range(0, z.shape[0], BATCH_SIZE):
            j = min(i + BATCH_SIZE, z.shape[0])
            similarities = z[i:j].dot(x.T)
            ind = (-similarities).argsort(axis=1)
            ranks = asnumpy(ind.argsort(axis=1)[:, src])
            sims = asnumpy(similarities[:, src])
            for k in range(i, j):
                for l in range(len(src)):
                    rank = ranks[k - i, l]
                    sim = sims[k - i, l]
                    if rank < best_rank[l] or (rank == best_rank[l] and sim > best_sim[l]):
                        best_rank[l] = rank
                        best_sim[l] = sim
                        # k is the target word that currently ranks this
                        # source word best.  The previous code appended
                        # `nn[k]`, but `nn` is never defined in this branch.
                        translation[src[l]] = [k]
    elif args.retrieval == 'topk':  # Keep the knn most similar targets
        for i in range(0, len(src), BATCH_SIZE):
            j = min(i + BATCH_SIZE, len(src))
            similarities = x[src[i:j]].dot(z.T)
            nn = similarities.argsort(axis=1).tolist()
            for k in range(j - i):
                translation[src[i + k]] = nn[k][-knn:]
    elif args.retrieval == 'invsoftmax':  # Inverted softmax
        sample = xp.arange(x.shape[0]) if args.inv_sample is None else xp.random.randint(0, x.shape[0], args.inv_sample)
        partition = xp.zeros(z.shape[0])
        for i in range(0, len(sample), BATCH_SIZE):
            j = min(i + BATCH_SIZE, len(sample))
            partition += xp.exp(args.inv_temperature*z.dot(x[sample[i:j]].T)).sum(axis=1)
        for i in range(0, len(src), BATCH_SIZE):
            j = min(i + BATCH_SIZE, len(src))
            p = xp.exp(args.inv_temperature*x[src[i:j]].dot(z.T)) / partition
            nn = p.argmax(axis=1).tolist()
            for k in range(j - i):
                translation[src[i + k]].append(nn[k])
    elif args.retrieval == 'csls':  # Cross-domain similarity local scaling
        knn_sim_bwd = xp.zeros(z.shape[0])
        for i in range(0, z.shape[0], BATCH_SIZE):
            j = min(i + BATCH_SIZE, z.shape[0])
            knn_sim_bwd[i:j] = topk_mean(z[i:j].dot(x.T), k=args.neighborhood, inplace=True)
        for i in range(0, len(src), BATCH_SIZE):
            j = min(i + BATCH_SIZE, len(src))
            similarities = 2*x[src[i:j]].dot(z.T) - knn_sim_bwd  # Equivalent to the real CSLS scores for NN
            nn = similarities.argmax(axis=1).tolist()
            for k in range(j - i):
                translation[src[i + k]].append(nn[k])

    # Compute accuracy
    accuracy = np.mean([1 if len(set(translation[i]) & set(src2trg[i])) > 0 else 0 for i in src])
    print('KNN: {0:}  Coverage:{1:7.2%}  Accuracy:{2:7.2%}'.format(knn, coverage, np.mean(accuracy)))
def main():
    """Evaluate embeddings on word similarity/relatedness datasets.

    Each input dataset is a tab-separated file of ``word1<TAB>word2<TAB>score``
    lines.  For every dataset the dot product of the length-normalized
    embeddings of each pair is correlated (Pearson and Spearman) with the gold
    scores, and coverage plus both correlations are printed.  With more than
    one dataset, averages over the subsets named by ``--sim``, ``--rel`` and
    ``--all`` are printed as well.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Evaluate embeddings in word similarity/relatedness')
    parser.add_argument('src_embeddings', help='the source language embeddings')
    parser.add_argument('trg_embeddings', nargs='?', help='the target language embeddings')
    parser.add_argument('-i', '--input', default=[sys.stdin.fileno()], nargs='+', help='the input datasets (defaults to stdin)')
    parser.add_argument('-l', '--lowercase', action='store_true', help='lowercase the words in the test files')
    parser.add_argument('--backoff', default=None, type=float, help='use a backoff similarity score for OOV entries')
    parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32', help='the floating-point precision (defaults to fp32)')
    parser.add_argument('--sim', nargs='*', help='the names of the datasets to include in the similarity results')
    parser.add_argument('--rel', nargs='*', help='the names of the datasets to include in the relatedness results')
    parser.add_argument('--all', nargs='*', help='the names of the datasets to include in the total results')
    args = parser.parse_args()

    # Choose the right dtype for the desired precision (argparse `choices`
    # guarantees the key is present).
    dtype = {'fp16': 'float16', 'fp32': 'float32', 'fp64': 'float64'}[args.precision]

    # Parse test files: one list of (src, trg) pairs and one list of gold
    # scores per dataset.
    word_pairs = []
    golds = []
    for filename in args.input:
        pairs = []
        scores = []
        with open(filename, encoding=args.encoding, errors='surrogateescape') as f:
            for line in f:
                if args.lowercase:
                    line = line.lower()
                src, trg, score = line.split('\t')
                pairs.append((src, trg))
                scores.append(float(score))
        word_pairs.append(pairs)
        golds.append(scores)

    # Build vocabularies
    src_vocab = {pair[0] for pairs in word_pairs for pair in pairs}
    trg_vocab = {pair[1] for pairs in word_pairs for pair in pairs}

    # Read embeddings; the source file doubles as target when none is given.
    trg_path = args.src_embeddings if args.trg_embeddings is None else args.trg_embeddings
    with open(args.src_embeddings, encoding=args.encoding, errors='surrogateescape') as srcfile, \
            open(trg_path, encoding=args.encoding, errors='surrogateescape') as trgfile:
        src_words, src_matrix = embeddings.read(srcfile, vocabulary=src_vocab, dtype=dtype)
        trg_words, trg_matrix = embeddings.read(trgfile, vocabulary=trg_vocab, dtype=dtype)

    # Length normalize embeddings so their dot product effectively computes the cosine similarity
    # NOTE(review): the return value is discarded, so this relies on
    # embeddings.length_normalize working in place -- confirm.
    embeddings.length_normalize(src_matrix)
    embeddings.length_normalize(trg_matrix)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Compute system scores and correlations
    results = []
    for source, gold_scores, pairs in zip(args.input, golds, word_pairs):
        system = []
        gold = []
        oov = 0
        for gold_score, (src, trg) in zip(gold_scores, pairs):
            try:
                cos = np.dot(src_matrix[src_word2ind[src]], trg_matrix[trg_word2ind[trg]])
            except KeyError:
                if args.backoff is None:
                    oov += 1
                else:
                    system.append(args.backoff)
                    gold.append(gold_score)
            else:
                system.append(cos)
                gold.append(gold_score)
        # Dataset name is the file name without directory and extension
        # ('./data/en-de.txt' -> 'en-de').  The stdin default is a file
        # descriptor (int), which os.path.basename cannot handle.
        if isinstance(source, str):
            name = os.path.splitext(os.path.basename(source))[0]
        else:
            name = 'stdin'
        coverage = len(system) / (len(system) + oov)
        pearson = scipy.stats.pearsonr(gold, system)[0]
        spearman = scipy.stats.spearmanr(gold, system)[0]
        results.append((name, coverage, pearson, spearman))
        print('Coverage:{0:7.2%}  Pearson:{1:7.2%}  Spearman:{2:7.2%} | {3}'.format(coverage, pearson, spearman, name))

    def print_average(names, label):
        """Print the mean coverage/correlations of the datasets in `names`."""
        if names is None:
            return
        selected = [res for res in results if res[0] in names]
        if not selected:  # nothing matched: there is no average to report
            return
        columns = list(zip(*selected))  # transpose rows into per-metric columns
        print('Coverage:{0:7.2%}  Pearson:{1:7.2%}  Spearman:{2:7.2%} | {3}'.format(
            np.mean(columns[1]), np.mean(columns[2]), np.mean(columns[3]), label))

    # Compute and print total (averaged) results when there are several inputs
    if len(results) > 1:
        print('-'*80)
        print_average(args.sim, 'sim.')
        print_average(args.rel, 'rel.')
        print_average(args.all, 'all')
def main():
    """Evaluate word embeddings on a word-analogy test file.

    The test file consists of category header lines starting with ``': '``
    followed by lines of four words ``a b c d``.  For each question the
    nearest neighbour of ``c - a + b`` (excluding a, b and c) is compared
    with ``d``.  Categories whose name starts with ``gram`` are counted as
    syntactic, all others as semantic.  Coverage and accuracy are printed;
    a ratio whose denominator is zero is reported as NaN instead of raising.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Evaluate embeddings in word analogy')
    parser.add_argument('embeddings', help='the word embeddings')
    parser.add_argument(
        '-t',
        '--threshold',
        type=int,
        default=0,
        help=
        'reduce vocabulary of the model for fast approximate evaluation (0 = off, otherwise typical value is 30,000)'
    )
    parser.add_argument('-i',
                        '--input',
                        default=sys.stdin.fileno(),
                        help='the test file (defaults to stdin)')
    parser.add_argument('-v',
                        '--verbose',
                        action='store_true',
                        help='verbose output (give category specific results)')
    parser.add_argument('-l',
                        '--lowercase',
                        action='store_true',
                        help='lowercase the words in the test file')
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--precision',
                        choices=['fp16', 'fp32', 'fp64'],
                        default='fp64',
                        help='the floating-point precision (defaults to fp64)')
    args = parser.parse_args()

    def ratio(numerator, denominator):
        """Return numerator/denominator, or NaN when the denominator is 0."""
        return numerator / denominator if denominator else float('nan')

    # Choose the right dtype for the desired precision (argparse `choices`
    # guarantees the key is present).
    dtype = {'fp16': 'float16', 'fp32': 'float32', 'fp64': 'float64'}[args.precision]

    # Read input embeddings
    with open(args.embeddings, encoding=args.encoding, errors='surrogateescape') as f:
        words, matrix = embeddings.read(f, threshold=args.threshold, dtype=dtype)

    # Build word to index map
    word2ind = {word: i for i, word in enumerate(words)}

    # Length normalize embeddings
    matrix = embeddings.length_normalize(matrix)

    # Parse test file
    categories = []
    src1 = []
    trg1 = []
    src2 = []
    trg2 = []
    with open(args.input, encoding=args.encoding, errors='surrogateescape') as f:
        for line in f:
            if line.startswith(': '):
                name = line[2:-1]  # drop the ': ' prefix and the trailing newline
                categories.append({
                    'name': name,
                    'is_syntactic': name.startswith('gram'),
                    'total': 0,
                    'oov': 0
                })
                continue
            try:
                ind = [
                    word2ind[word.lower() if args.lowercase else word]
                    for word in line.split()
                ]
            except KeyError:
                # At least one of the words is out of vocabulary.
                categories[-1]['oov'] += 1
                continue
            src1.append(ind[0])
            trg1.append(ind[1])
            src2.append(ind[2])
            trg2.append(ind[3])
            categories[-1]['total'] += 1
    total = len(src1)

    # Compute nearest neighbors using efficient matrix multiplication
    nn = []
    for i in range(0, total, BATCH_SIZE):
        j = min(i + BATCH_SIZE, total)
        similarities = (matrix[src2[i:j]] - matrix[src1[i:j]] +
                        matrix[trg1[i:j]]).dot(matrix.T)
        # The three question words must never be returned as the answer.
        rows = range(j - i)
        similarities[rows, src1[i:j]] = -1
        similarities[rows, trg1[i:j]] = -1
        similarities[rows, src2[i:j]] = -1
        nn += np.argmax(similarities, axis=1).tolist()
    nn = np.array(nn)

    # Compute and print accuracies
    semantic = {'correct': 0, 'total': 0, 'oov': 0}
    syntactic = {'correct': 0, 'total': 0, 'oov': 0}
    ind = 0
    for category in categories:
        current = syntactic if category['is_syntactic'] else semantic
        # Questions were appended category by category, so each category owns
        # a contiguous slice of the answers.
        end = ind + category['total']
        correct = np.sum(nn[ind:end] == trg2[ind:end])
        current['correct'] += correct
        current['total'] += category['total']
        current['oov'] += category['oov']
        ind = end
        if args.verbose:
            print('Coverage:{0:7.2%}  Accuracy:{1:7.2%} | {2}'.format(
                ratio(category['total'], category['total'] + category['oov']),
                ratio(correct, category['total']), category['name']))
    if args.verbose:
        print('-' * 80)
    answered = semantic['total'] + syntactic['total']
    print('Coverage:{0:7.2%}  Accuracy:{1:7.2%} (sem:{2:7.2%}, syn:{3:7.2%})'.
          format(
              ratio(answered, answered + semantic['oov'] + syntactic['oov']),
              ratio(semantic['correct'] + syntactic['correct'], answered),
              ratio(semantic['correct'], semantic['total']),
              ratio(syntactic['correct'], syntactic['total'])))
def main():
    """Train a GeoMM mapping between two embedding spaces and evaluate it.

    Steps, in order:
      0. read both embedding files and the training dictionary, then apply
         the requested normalization actions;
      1. optimize (U1, U2, B) over Stiefel x Stiefel x PositiveDefinite with
         conjugate gradient so that x U1 B U2^T z^T matches the 0/1 training
         dictionary matrix, with an L2 penalty on B;
      2. map both spaces with U and sqrtm(B), optionally saving the model and
         the length-normalized mapped embeddings;
      3. retrieve translations for the test dictionary with CSLS and print
         coverage and top-1/5/10 accuracy, optionally writing the top-10
         translations to ``translations.csv``.

    The target-side CSLS neighbourhood is computed with CuPy on device 0.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Map the source embeddings into the target embedding space'
    )
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('--model_path',
                        default=None,
                        type=str,
                        help='directory to save the model')
    parser.add_argument(
        '--geomm_embeddings_path',
        default=None,
        type=str,
        help=
        'directory to save the output GeoMM latent space embeddings. The output embeddings are normalized.'
    )
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument(
        '--max_vocab',
        default=0,
        type=int,
        help='Maximum vocabulary to be loaded, 0 allows complete vocabulary')
    parser.add_argument('--verbose', default=0, type=int, help='Verbose')

    mapping_group = parser.add_argument_group(
        'mapping arguments', 'Basic embedding mapping arguments')
    mapping_group.add_argument(
        '-dtrain',
        '--dictionary_train',
        default=sys.stdin.fileno(),
        help='the training dictionary file (defaults to stdin)')
    mapping_group.add_argument(
        '-dtest',
        '--dictionary_test',
        default=sys.stdin.fileno(),
        help='the test dictionary file (defaults to stdin)')
    mapping_group.add_argument(
        '--normalize',
        choices=['unit', 'center', 'unitdim', 'centeremb'],
        nargs='*',
        default=[],
        help='the normalization actions to perform in order')

    geomm_group = parser.add_argument_group('GeoMM arguments',
                                            'Arguments for GeoMM method')
    geomm_group.add_argument('--l2_reg',
                             type=float,
                             default=1e2,
                             help='Lambda for L2 Regularization')
    geomm_group.add_argument(
        '--max_opt_time',
        type=int,
        default=5000,
        help='Maximum time limit for optimization in seconds')
    geomm_group.add_argument(
        '--max_opt_iter',
        type=int,
        default=150,
        help='Maximum number of iterations for optimization')

    eval_group = parser.add_argument_group('evaluation arguments',
                                           'Arguments for evaluation')
    eval_group.add_argument('--normalize_eval',
                            action='store_true',
                            help='Normalize the embeddings at test time')
    eval_group.add_argument('--eval_batch_size',
                            type=int,
                            default=1000,
                            help='Batch size for evaluation')
    eval_group.add_argument('--csls_neighbourhood',
                            type=int,
                            default=10,
                            help='Neighbourhood size for CSLS')

    args = parser.parse_args()
    BATCH_SIZE = args.eval_batch_size

    if args.verbose:
        print('Current arguments: {0}'.format(args))

    dtype = 'float32'
    if args.verbose:
        print('Loading train data...')
    # Read input embeddings
    with open(args.src_input, encoding=args.encoding, errors='surrogateescape') as srcfile, \
            open(args.trg_input, encoding=args.encoding, errors='surrogateescape') as trgfile:
        src_words, x = embeddings.read(srcfile,
                                       max_voc=args.max_vocab,
                                       dtype=dtype)
        trg_words, z = embeddings.read(trgfile,
                                       max_voc=args.max_vocab,
                                       dtype=dtype)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Build training dictionary
    noov = 0
    src_indices = []
    trg_indices = []
    with open(args.dictionary_train, encoding=args.encoding, errors='surrogateescape') as f:
        for line in f:
            src, trg = line.split()
            if args.max_vocab:
                src = src.lower()
                trg = trg.lower()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
            except KeyError:
                noov += 1
                if args.verbose:
                    print('WARNING: OOV dictionary entry ({0} - {1})'.format(
                        src, trg))
            else:
                src_indices.append(src_ind)
                trg_indices.append(trg_ind)
    if args.verbose:
        print('Number of training pairs having at least one OOV: {}'.format(
            noov))
    if args.verbose:
        print('Normalizing embeddings...')

    # STEP 0: Normalization
    for action in args.normalize:
        if action == 'unit':
            x = embeddings.length_normalize(x)
            z = embeddings.length_normalize(z)
        elif action == 'center':
            x = embeddings.mean_center(x)
            z = embeddings.mean_center(z)
        elif action == 'unitdim':
            x = embeddings.length_normalize_dimensionwise(x)
            z = embeddings.length_normalize_dimensionwise(z)
        elif action == 'centeremb':
            x = embeddings.mean_center_embeddingwise(x)
            z = embeddings.mean_center_embeddingwise(z)

    # Step 1: Optimization
    if args.verbose:
        print('Beginning Optimization')
    start_time = time.time()

    # Creating dictionary matrix from training set.  Each distinct word gets
    # a row/column in order of first appearance.
    map_dict_src = {}
    uniq_src = []
    for idx in src_indices:
        if idx not in map_dict_src:
            map_dict_src[idx] = len(uniq_src)
            uniq_src.append(idx)
    map_dict_trg = {}
    uniq_trg = []
    for idx in trg_indices:
        if idx not in map_dict_trg:
            map_dict_trg[idx] = len(uniq_trg)
            uniq_trg.append(idx)

    # A[r, c] == 1 iff (uniq_src[r], uniq_trg[c]) is a training pair.
    A = np.zeros((len(uniq_src), len(uniq_trg)))
    for src_ind, trg_ind in zip(src_indices, trg_indices):
        A[map_dict_src[src_ind], map_dict_trg[trg_ind]] = 1

    np.random.seed(0)
    Lambda = args.l2_reg

    U1 = TT.matrix()
    U2 = TT.matrix()
    B = TT.matrix()
    # Squared error of the predicted pair scores plus the L2 penalty on B.
    cost = TT.sum(((shared(x[uniq_src]).dot(U1.dot(B.dot(U2.T)))).dot(
        shared(z[uniq_trg]).T) - A)**2) + 0.5 * Lambda * (TT.sum(B**2))

    solver = ConjugateGradient(maxtime=args.max_opt_time,
                               maxiter=args.max_opt_iter)

    manifold = Product([
        Stiefel(x.shape[1], x.shape[1]),
        Stiefel(z.shape[1], x.shape[1]),
        PositiveDefinite(x.shape[1])
    ])
    problem = Problem(manifold=manifold,
                      cost=cost,
                      arg=[U1, U2, B],
                      verbosity=3)
    wopt = solver.solve(problem)

    # From here on the names refer to the optimized matrices, not the
    # symbolic variables.
    U1 = wopt[0]
    U2 = wopt[1]
    B = wopt[2]

    ### Save the models if requested
    if args.model_path is not None:
        os.makedirs(args.model_path, exist_ok=True)
        np.savetxt('{}/U_src.csv'.format(args.model_path), U1)
        np.savetxt('{}/U_tgt.csv'.format(args.model_path), U2)
        np.savetxt('{}/B.csv'.format(args.model_path), B)

    # Step 2: Transformation (the matrix square root is shared by both sides)
    sqrt_B = scipy.linalg.sqrtm(B)
    xw = x.dot(U1).dot(sqrt_B)
    zw = z.dot(U2).dot(sqrt_B)

    end_time = time.time()
    if args.verbose:
        print('Completed training in {0:.2f} seconds'.format(end_time -
                                                             start_time))
    gc.collect()

    ### Save the GeoMM embeddings if requested
    xw_n = embeddings.length_normalize(xw)
    zw_n = embeddings.length_normalize(zw)
    if args.geomm_embeddings_path is not None:
        os.makedirs(args.geomm_embeddings_path, exist_ok=True)

        out_emb_fname = os.path.join(args.geomm_embeddings_path, 'src.vec')
        with open(out_emb_fname, 'w', encoding=args.encoding) as outfile:
            embeddings.write(src_words, xw_n, outfile)

        out_emb_fname = os.path.join(args.geomm_embeddings_path, 'trg.vec')
        with open(out_emb_fname, 'w', encoding=args.encoding) as outfile:
            embeddings.write(trg_words, zw_n, outfile)

    # Step 3: Evaluation
    if args.normalize_eval:
        xw = xw_n
        zw = zw_n

    # Loading test dictionary
    src2trg = collections.defaultdict(set)
    oov = set()
    vocab = set()
    with open(args.dictionary_test, encoding=args.encoding, errors='surrogateescape') as f:
        for line in f:
            src, trg = line.split()
            if args.max_vocab:
                src = src.lower()
                trg = trg.lower()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
            except KeyError:
                oov.add(src)
            else:
                src2trg[src_ind].add(trg_ind)
                vocab.add(src)
    src = list(src2trg.keys())
    oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
    coverage = len(src2trg) / (len(src2trg) + len(oov))

    translation = collections.defaultdict(int)
    translation5 = collections.defaultdict(list)
    translation10 = collections.defaultdict(list)

    ### compute nearest neigbours of x in z
    # Mean similarity of each test source word to its k most similar target
    # words (partition on the negated scores puts the k largest first).
    nbrhood_x = np.zeros(xw.shape[0])
    for i in range(0, len(src), BATCH_SIZE):
        j = min(i + BATCH_SIZE, len(src))
        similarities = xw[src[i:j]].dot(zw.T)
        similarities_x = -1 * np.partition(
            -1 * similarities, args.csls_neighbourhood - 1, axis=1)
        nbrhood_x[src[i:j]] = np.mean(
            similarities_x[:, :args.csls_neighbourhood], axis=1)

    ### compute nearest neigbours of z in x (GPU version)
    with cp.cuda.Device(0):
        nbrhood_z2 = cp.zeros(zw.shape[0])
        for i in range(0, zw.shape[0], BATCH_SIZE):
            j = min(i + BATCH_SIZE, zw.shape[0])
            similarities = -1 * cp.partition(
                -1 * cp.dot(cp.asarray(zw[i:j]), cp.transpose(cp.asarray(xw))),
                args.csls_neighbourhood - 1,
                axis=1)[:, :args.csls_neighbourhood]
            nbrhood_z2[i:j] = cp.mean(
                similarities[:, :args.csls_neighbourhood], axis=1)
        nbrhood_z = cp.asnumpy(nbrhood_z2)

    ### find translation (and write to file if output requested)
    delim = ','
    translations_file = None
    if args.geomm_embeddings_path is not None:
        translations_fname = os.path.join(args.geomm_embeddings_path,
                                          'translations.csv')
        translations_file = open(translations_fname,
                                 'w',
                                 encoding=args.encoding)

    try:
        for i in range(0, len(src), BATCH_SIZE):
            j = min(i + BATCH_SIZE, len(src))
            similarities = xw[src[i:j]].dot(zw.T)
            # CSLS score 2*sim(x, z) - r(x) - r(z); the double transpose
            # broadcasts the per-row source term.
            similarities = np.transpose(
                np.transpose(2 * similarities) -
                nbrhood_x[src[i:j]]) - nbrhood_z
            nn = similarities.argmax(axis=1).tolist()
            similarities = np.argsort(similarities, axis=1)

            nn5 = similarities[:, -5:]
            nn10 = similarities[:, -10:]
            for k in range(j - i):
                src_id = src[i + k]
                translation[src_id] = nn[k]
                translation5[src_id] = nn5[k]
                translation10[src_id] = nn10[k]

                if translations_file is not None:
                    # One row per source word: gold translations, then the
                    # ten retrieved candidates, each list joined with ':'.
                    p1 = ':'.join(
                        trg_words[trg_id] for trg_id in src2trg[src_id])
                    p2 = ':'.join(
                        trg_words[trg_id] for trg_id in translation10[src_id])
                    translations_file.write(
                        '{s}{delim}{p1}{delim}{p2}\n'.format(s=src_words[src_id],
                                                             p1=p1,
                                                             p2=p2,
                                                             delim=delim))
    finally:
        # Close the output even if retrieval fails part-way through.
        if translations_file is not None:
            translations_file.close()

    accuracy = np.mean([1 if translation[i] in src2trg[i] else 0 for i in src])
    accuracy5 = sum(
        1 for i in src if any(k in src2trg[i] for k in translation5[i])) / len(src)
    accuracy10 = sum(
        1 for i in src if any(k in src2trg[i] for k in translation10[i])) / len(src)
    print(
        'Coverage:{0:7.2%}  Accuracy:{1:7.2%}  Accuracy(Top 5):{2:7.2%}  Accuracy(Top 10):{3:7.2%}'
        .format(coverage, accuracy, accuracy5, accuracy10))
def main():
    """Learn a linear map from source to target embeddings and write both out.

    Command-line entry point. Reads two embedding files (optionally augmented
    with character n-gram counts via ``embeddings.orthoread``), builds a seed
    dictionary either from shared numerals or from a dictionary file, then
    alternates between solving for the mapping ``w`` and, when
    ``--self_learning`` is set, re-inducing the dictionary by nearest
    neighbour until the objective improves by less than ``--threshold``.
    The mapped source embeddings and the (possibly normalized) target
    embeddings are written to ``src_output`` / ``trg_output``.

    All files opened here are closed explicitly; the log handle is tracked in
    ``log`` (``None`` when logging is off) so the create/write checks agree.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Map the source embeddings into the target embedding space'
    )
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('src_output', help='the output source embeddings')
    parser.add_argument('trg_output', help='the output target embeddings')
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')

    mapping_group = parser.add_argument_group(
        'mapping arguments', 'Basic embedding mapping arguments (EMNLP 2016)')
    mapping_group.add_argument(
        '-d',
        '--dictionary',
        default=sys.stdin.fileno(),
        help='the training dictionary file (defaults to stdin)')
    mapping_group.add_argument(
        '--normalize',
        choices=['unit', 'center', 'unitdim', 'centeremb'],
        nargs='*',
        default=[],
        help='the normalization actions to perform in order')
    mapping_group.add_argument(
        '-c',
        '--orthogonal',
        dest='orthogonal',
        action='store_true',
        help='use orthogonal constrained mapping (default)')
    mapping_group.add_argument('-u',
                               '--unconstrained',
                               dest='orthogonal',
                               action='store_false',
                               help='use unconstrained mapping')
    parser.set_defaults(orthogonal=True)

    self_learning_group = parser.add_argument_group(
        'self-learning arguments',
        'Optional arguments for self-learning (ACL 2017)')
    self_learning_group.add_argument('--self_learning',
                                     action='store_true',
                                     help='enable self-learning')
    self_learning_group.add_argument(
        '--direction',
        choices=['forward', 'backward', 'union'],
        default='forward',
        help='the direction for dictionary induction (defaults to forward)')
    self_learning_group.add_argument(
        '--numerals',
        action='store_true',
        help=
        'use latin numerals (i.e. words matching [0-9]+) as the seed dictionary'
    )
    self_learning_group.add_argument(
        '--orthographic_ext',
        default=0,
        type=float,
        help=
        'augment embeddings with character n-gram counts; provide inverse scale constant as argument'
    )
    self_learning_group.add_argument(
        '--orthographic_ext_n',
        default=1,
        type=int,
        help='n for character n-grams in orthograhpic_ext option')
    self_learning_group.add_argument(
        '--orthographic_sim',
        default=0,
        type=float,
        help=
        'use edit distance when calculating similarity; provide inverse scale constant as argument'
    )
    self_learning_group.add_argument(
        '--orthographic_sim_k',
        default=1,
        type=int,
        help=
        'k to use for symmetric delete heuristic for limiting edit distance calculations'
    )
    self_learning_group.add_argument(
        '--threshold',
        default=0.000001,
        type=float,
        help='the convergence threshold (defaults to 0.000001)')
    self_learning_group.add_argument(
        '--validation',
        default=None,
        help='a dictionary file for validation at each iteration')
    self_learning_group.add_argument(
        '--log', help='write to a log file in tsv format at each iteration')
    self_learning_group.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        help='write log information to stderr at each iteration')
    args = parser.parse_args()

    # Read input embeddings; both handles are released as soon as parsing ends
    with open(args.src_input, encoding=args.encoding,
              errors='surrogateescape') as srcfile, \
            open(args.trg_input, encoding=args.encoding,
                 errors='surrogateescape') as trgfile:
        if args.orthographic_ext:
            (src_words, x), (trg_words, z) = embeddings.orthoread(
                srcfile, trgfile, args.orthographic_ext,
                args.orthographic_ext_n)
        else:
            src_words, x = embeddings.read(srcfile)
            trg_words, z = embeddings.read(trgfile)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Build training dictionary
    src_indices = []
    trg_indices = []
    if args.numerals:
        if args.dictionary != sys.stdin.fileno():
            print('WARNING: Using numerals instead of the training dictionary',
                  file=sys.stderr)
        numeral_regex = re.compile('^[0-9]+$')
        src_numerals = {
            word
            for word in src_words if numeral_regex.match(word) is not None
        }
        trg_numerals = {
            word
            for word in trg_words if numeral_regex.match(word) is not None
        }
        numerals = src_numerals.intersection(trg_numerals)
        for word in numerals:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
    else:
        with open(args.dictionary,
                  encoding=args.encoding,
                  errors='surrogateescape') as f:
            for line in f:
                src, trg = line.split()
                try:
                    src_ind = src_word2ind[src]
                    trg_ind = trg_word2ind[trg]
                    src_indices.append(src_ind)
                    trg_indices.append(trg_ind)
                except KeyError:
                    print('WARNING: OOV dictionary entry ({0} - {1})'.format(
                        src, trg),
                          file=sys.stderr)

    # Read validation dictionary
    if args.validation is not None:
        validation = collections.defaultdict(set)
        oov = set()
        vocab = set()
        with open(args.validation,
                  encoding=args.encoding,
                  errors='surrogateescape') as f:
            for line in f:
                src, trg = line.split()
                try:
                    src_ind = src_word2ind[src]
                    trg_ind = trg_word2ind[trg]
                    validation[src_ind].add(trg_ind)
                    vocab.add(src)
                except KeyError:
                    oov.add(src)
        oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
        validation_coverage = len(validation) / (len(validation) + len(oov))

    # Create log file; `log` stays None when logging is off so that every
    # later check tests the handle itself instead of re-deriving it from args
    log = None
    if args.log:
        log = open(args.log,
                   mode='w',
                   encoding=args.encoding,
                   errors='surrogateescape')

    # Normalize embeddings
    for action in args.normalize:
        if action == 'unit':
            x = embeddings.length_normalize(x)
            z = embeddings.length_normalize(z)
        elif action == 'center':
            x = embeddings.mean_center(x)
            z = embeddings.mean_center(z)
        elif action == 'unitdim':
            x = embeddings.length_normalize_dimensionwise(x)
            z = embeddings.length_normalize_dimensionwise(z)
        elif action == 'centeremb':
            x = embeddings.mean_center_embeddingwise(x)
            z = embeddings.mean_center_embeddingwise(z)

    # Training loop
    prev_objective = objective = -100.
    it = 1

    ortho_sim = None
    ortho_sim_scale = args.orthographic_sim
    if args.orthographic_sim:
        # The first two characters of each file name are passed as the keys
        # for the similarity matrix (presumably language codes -- confirm).
        s = ntpath.basename(args.src_input)[0:2]
        t = ntpath.basename(args.trg_input)[0:2]
        k = args.orthographic_sim_k
        ortho_sim = ortho.loadOrCreateSimilarityMatrix(s, t, k)

    t = time.time()
    while it == 1 or objective - prev_objective >= args.threshold:

        # Update the embedding mapping
        if args.orthogonal:  # orthogonal mapping
            u, s, vt = np.linalg.svd(np.dot(z[trg_indices].T, x[src_indices]))
            w = np.dot(vt.T, u.T)
        else:  # unconstrained mapping
            x_pseudoinv = np.dot(
                np.linalg.inv(np.dot(x[src_indices].T, x[src_indices])),
                x[src_indices].T)
            w = np.dot(x_pseudoinv, z[trg_indices])
        xw = x.dot(w)

        # Self-learning
        # NOTE(review): SOURCE's indentation was lost; evaluation and logging
        # are kept inside this branch because they read values that only this
        # branch defines (e.g. trg_indices_forward) -- confirm.
        if args.self_learning:

            # Update the training dictionary
            best_sim_forward = np.full(x.shape[0], -100.)
            src_indices_forward = range(x.shape[0])
            trg_indices_forward = np.zeros(x.shape[0], dtype=int)
            best_sim_backward = np.full(z.shape[0], -100.)
            src_indices_backward = np.zeros(z.shape[0], dtype=int)
            trg_indices_backward = range(z.shape[0])
            # Similarities are computed tile by tile so the full matrix is
            # never materialized at once.
            for i in range(0, x.shape[0], MAX_DIM_X):
                for j in range(0, z.shape[0], MAX_DIM_Z):
                    sim = xw[i:i + MAX_DIM_X].dot(z[j:j + MAX_DIM_Z].T)
                    if args.orthographic_sim:
                        sim += (ortho_sim[i:i + MAX_DIM_X, j:j + MAX_DIM_Z].
                                toarray()) / ortho_sim_scale
                    for k in range(sim.shape[0]):
                        l = sim[k].argmax()
                        if sim[k, l] > best_sim_forward[i + k]:
                            best_sim_forward[i + k] = sim[k, l]
                            trg_indices_forward[i + k] = j + l
                    if args.direction in (
                            'backward',
                            'union'):  # Slow, only do if necessary
                        for l in range(sim.shape[1]):
                            k = sim[:, l].argmax()
                            if sim[k, l] > best_sim_backward[j + l]:
                                best_sim_backward[j + l] = sim[k, l]
                                src_indices_backward[j + l] = i + k
                    sim = None  # drop the tile before computing the next one
            if args.direction == 'forward':
                src_indices = src_indices_forward
                trg_indices = trg_indices_forward
            elif args.direction == 'backward':
                src_indices = src_indices_backward
                trg_indices = trg_indices_backward
            elif args.direction == 'union':
                src_indices = np.concatenate(
                    (src_indices_forward, src_indices_backward))
                trg_indices = np.concatenate(
                    (trg_indices_forward, trg_indices_backward))

            # Objective function evaluation
            prev_objective = objective
            if args.direction == 'forward':
                objective = np.mean(best_sim_forward)
            elif args.direction == 'backward':
                objective = np.mean(best_sim_backward)
            elif args.direction == 'union':
                objective = (np.mean(best_sim_forward) +
                             np.mean(best_sim_backward)) / 2

            # Accuracy and similarity evaluation in validation
            if args.validation is not None:
                accuracy = np.mean([
                    1 if trg_indices_forward[src] in trg else 0
                    for src, trg in validation.items()
                ])
                similarity = np.mean([
                    np.max(z[list(trg)].dot(xw[src]))
                    for src, trg in validation.items()
                ])

            # Logging
            duration = time.time() - t
            if args.verbose:
                print(file=sys.stderr)
                print('ITERATION {0} ({1:.2f}s)'.format(it, duration),
                      file=sys.stderr)
                print('\t- Objective: {0:9.4f}%'.format(100 * objective),
                      file=sys.stderr)
                if args.validation is not None:
                    print('\t- Val. similarity: {0:9.4f}%'.format(
                        100 * similarity),
                          file=sys.stderr)
                    print('\t- Val. accuracy: {0:9.4f}%'.format(100 *
                                                                accuracy),
                          file=sys.stderr)
                    print('\t- Val. coverage: {0:9.4f}%'.format(
                        100 * validation_coverage),
                          file=sys.stderr)
                sys.stderr.flush()
            if log is not None:
                val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format(
                    100 * similarity, 100 * accuracy, 100 *
                    validation_coverage) if args.validation is not None else ''
                print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(
                    it, 100 * objective, val, duration),
                      file=log)
                log.flush()

        t = time.time()
        it += 1

    if log is not None:
        log.close()

    # Write mapped embeddings
    with open(args.src_output,
              mode='w',
              encoding=args.encoding,
              errors='surrogateescape') as srcfile, \
            open(args.trg_output,
                 mode='w',
                 encoding=args.encoding,
                 errors='surrogateescape') as trgfile:
        embeddings.write(src_words, xw, srcfile)
        embeddings.write(trg_words, z, trgfile)
def main():
    """Learn a linear map from source to target embeddings (NumPy or CuPy).

    Command-line entry point. Reads two embedding files at the requested
    floating-point precision, optionally moves them to the GPU (``--cuda``),
    builds a seed dictionary from shared numerals or from a dictionary file,
    and alternates between solving for the mapping ``w`` and, when
    ``--self_learning`` is set, re-inducing the dictionary by nearest
    neighbour until the objective improves by less than ``--threshold``.
    The mapped source embeddings and the (possibly normalized) target
    embeddings are written to ``src_output`` / ``trg_output``.

    Exits with status -1 when ``--cuda`` is requested but CuPy is not
    available. All files opened here are closed explicitly; the log handle is
    tracked in ``log`` (``None`` when logging is off).
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Map the source embeddings into the target embedding space'
    )
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('src_output', help='the output source embeddings')
    parser.add_argument('trg_output', help='the output target embeddings')
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--precision',
                        choices=['fp16', 'fp32', 'fp64'],
                        default='fp64',
                        help='the floating-point precision (defaults to fp64)')
    parser.add_argument('--cuda',
                        action='store_true',
                        help='use cuda (requires cupy)')

    mapping_group = parser.add_argument_group(
        'mapping arguments', 'Basic embedding mapping arguments (EMNLP 2016)')
    mapping_group.add_argument(
        '-d',
        '--dictionary',
        default=sys.stdin.fileno(),
        help='the training dictionary file (defaults to stdin)')
    mapping_group.add_argument(
        '--normalize',
        choices=['unit', 'center', 'unitdim', 'centeremb'],
        nargs='*',
        default=[],
        help='the normalization actions to perform in order')
    mapping_group.add_argument(
        '-c',
        '--orthogonal',
        dest='orthogonal',
        action='store_true',
        help='use orthogonal constrained mapping (default)')
    mapping_group.add_argument('-u',
                               '--unconstrained',
                               dest='orthogonal',
                               action='store_false',
                               help='use unconstrained mapping')
    parser.set_defaults(orthogonal=True)

    self_learning_group = parser.add_argument_group(
        'self-learning arguments',
        'Optional arguments for self-learning (ACL 2017)')
    self_learning_group.add_argument('--self_learning',
                                     action='store_true',
                                     help='enable self-learning')
    self_learning_group.add_argument(
        '--direction',
        choices=['forward', 'backward', 'union'],
        default='forward',
        help='the direction for dictionary induction (defaults to forward)')
    self_learning_group.add_argument(
        '--numerals',
        action='store_true',
        help=
        'use latin numerals (i.e. words matching [0-9]+) as the seed dictionary'
    )
    self_learning_group.add_argument(
        '--threshold',
        default=0.000001,
        type=float,
        help='the convergence threshold (defaults to 0.000001)')
    self_learning_group.add_argument(
        '--validation',
        default=None,
        help='a dictionary file for validation at each iteration')
    self_learning_group.add_argument(
        '--log', help='write to a log file in tsv format at each iteration')
    self_learning_group.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        help='write log information to stderr at each iteration')
    args = parser.parse_args()

    # Choose the right dtype for the desired precision; argparse `choices`
    # guarantees the key is present.
    dtype = {
        'fp16': 'float16',
        'fp32': 'float32',
        'fp64': 'float64',
    }[args.precision]

    # Read input embeddings; both handles are released as soon as parsing ends
    with open(args.src_input, encoding=args.encoding,
              errors='surrogateescape') as srcfile, \
            open(args.trg_input, encoding=args.encoding,
                 errors='surrogateescape') as trgfile:
        src_words, x = embeddings.read(srcfile, dtype=dtype)
        trg_words, z = embeddings.read(trgfile, dtype=dtype)

    # NumPy/CuPy management
    if args.cuda:
        if not supports_cupy():
            print('ERROR: Install CuPy for CUDA support', file=sys.stderr)
            sys.exit(-1)
        xp = get_cupy()
        x = xp.asarray(x)
        z = xp.asarray(z)
    else:
        xp = np

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Build training dictionary
    src_indices = []
    trg_indices = []
    if args.numerals:
        if args.dictionary != sys.stdin.fileno():
            print('WARNING: Using numerals instead of the training dictionary',
                  file=sys.stderr)
        numeral_regex = re.compile('^[0-9]+$')
        src_numerals = {
            word
            for word in src_words if numeral_regex.match(word) is not None
        }
        trg_numerals = {
            word
            for word in trg_words if numeral_regex.match(word) is not None
        }
        numerals = src_numerals.intersection(trg_numerals)
        for word in numerals:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
    else:
        with open(args.dictionary,
                  encoding=args.encoding,
                  errors='surrogateescape') as f:
            for line in f:
                src, trg = line.split()
                try:
                    src_ind = src_word2ind[src]
                    trg_ind = trg_word2ind[trg]
                    src_indices.append(src_ind)
                    trg_indices.append(trg_ind)
                except KeyError:
                    print('WARNING: OOV dictionary entry ({0} - {1})'.format(
                        src, trg),
                          file=sys.stderr)

    # Read validation dictionary
    if args.validation is not None:
        validation = collections.defaultdict(set)
        oov = set()
        vocab = set()
        with open(args.validation,
                  encoding=args.encoding,
                  errors='surrogateescape') as f:
            for line in f:
                src, trg = line.split()
                try:
                    src_ind = src_word2ind[src]
                    trg_ind = trg_word2ind[trg]
                    validation[src_ind].add(trg_ind)
                    vocab.add(src)
                except KeyError:
                    oov.add(src)
        oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
        validation_coverage = len(validation) / (len(validation) + len(oov))

    # Create log file; `log` stays None when logging is off so that every
    # later check tests the handle itself instead of re-deriving it from args
    log = None
    if args.log:
        log = open(args.log,
                   mode='w',
                   encoding=args.encoding,
                   errors='surrogateescape')

    # Normalize embeddings
    for action in args.normalize:
        if action == 'unit':
            x = embeddings.length_normalize(x)
            z = embeddings.length_normalize(z)
        elif action == 'center':
            x = embeddings.mean_center(x)
            z = embeddings.mean_center(z)
        elif action == 'unitdim':
            x = embeddings.length_normalize_dimensionwise(x)
            z = embeddings.length_normalize_dimensionwise(z)
        elif action == 'centeremb':
            x = embeddings.mean_center_embeddingwise(x)
            z = embeddings.mean_center_embeddingwise(z)

    # Training loop
    prev_objective = objective = -100.
    it = 1
    t = time.time()
    while it == 1 or objective - prev_objective >= args.threshold:

        # Update the embedding mapping
        if args.orthogonal:  # orthogonal mapping
            u, s, vt = xp.linalg.svd(z[trg_indices].T.dot(x[src_indices]))
            w = vt.T.dot(u.T)
        else:  # unconstrained mapping
            x_pseudoinv = xp.linalg.inv(x[src_indices].T.dot(
                x[src_indices])).dot(x[src_indices].T)
            w = x_pseudoinv.dot(z[trg_indices])
        xw = x.dot(w)

        # Self-learning
        # NOTE(review): SOURCE's indentation was lost; evaluation and logging
        # are kept inside this branch because the objective they report is
        # only updated here -- confirm against the upstream file.
        if args.self_learning:

            # Update the training dictionary
            best_sim_forward = xp.full(x.shape[0], -100, dtype=dtype)
            src_indices_forward = xp.arange(x.shape[0])
            trg_indices_forward = xp.zeros(x.shape[0], dtype=int)
            best_sim_backward = xp.full(z.shape[0], -100, dtype=dtype)
            src_indices_backward = xp.zeros(z.shape[0], dtype=int)
            trg_indices_backward = xp.arange(z.shape[0])
            # Similarities are computed tile by tile ([i:j] x [k:l]) so the
            # full matrix is never materialized at once.
            for i in range(0, x.shape[0], MAX_DIM_X):
                j = min(x.shape[0], i + MAX_DIM_X)
                for k in range(0, z.shape[0], MAX_DIM_Z):
                    l = min(z.shape[0], k + MAX_DIM_Z)
                    sim = xw[i:j].dot(z[k:l].T)
                    if args.direction in ('forward', 'union'):
                        ind = sim.argmax(axis=1)
                        val = sim[xp.arange(sim.shape[0]), ind]
                        ind += k  # tile-local column -> global target index
                        mask = (val > best_sim_forward[i:j])
                        best_sim_forward[i:j][mask] = val[mask]
                        trg_indices_forward[i:j][mask] = ind[mask]
                    if args.direction in ('backward', 'union'):
                        ind = sim.argmax(axis=0)
                        val = sim[ind, xp.arange(sim.shape[1])]
                        ind += i  # tile-local row -> global source index
                        mask = (val > best_sim_backward[k:l])
                        best_sim_backward[k:l][mask] = val[mask]
                        src_indices_backward[k:l][mask] = ind[mask]
            if args.direction == 'forward':
                src_indices = src_indices_forward
                trg_indices = trg_indices_forward
            elif args.direction == 'backward':
                src_indices = src_indices_backward
                trg_indices = trg_indices_backward
            elif args.direction == 'union':
                src_indices = xp.concatenate(
                    (src_indices_forward, src_indices_backward))
                trg_indices = xp.concatenate(
                    (trg_indices_forward, trg_indices_backward))

            # Objective function evaluation
            prev_objective = objective
            if args.direction == 'forward':
                objective = xp.mean(best_sim_forward).tolist()
            elif args.direction == 'backward':
                objective = xp.mean(best_sim_backward).tolist()
            elif args.direction == 'union':
                objective = (xp.mean(best_sim_forward) +
                             xp.mean(best_sim_backward)).tolist() / 2

            # Accuracy and similarity evaluation in validation
            if args.validation is not None:
                src = list(validation.keys())
                sim = xw[src].dot(z.T)  # TODO Assuming that it fits in memory
                nn = asnumpy(sim.argmax(axis=1))
                accuracy = np.mean([
                    1 if nn[i] in validation[src[i]] else 0
                    for i in range(len(src))
                ])
                similarity = np.mean([
                    max([sim[i, j].tolist() for j in validation[src[i]]])
                    for i in range(len(src))
                ])

            # Logging
            duration = time.time() - t
            if args.verbose:
                print(file=sys.stderr)
                print('ITERATION {0} ({1:.2f}s)'.format(it, duration),
                      file=sys.stderr)
                print('\t- Objective: {0:9.4f}%'.format(100 * objective),
                      file=sys.stderr)
                if args.validation is not None:
                    print('\t- Val. similarity: {0:9.4f}%'.format(
                        100 * similarity),
                          file=sys.stderr)
                    print('\t- Val. accuracy: {0:9.4f}%'.format(100 *
                                                                accuracy),
                          file=sys.stderr)
                    print('\t- Val. coverage: {0:9.4f}%'.format(
                        100 * validation_coverage),
                          file=sys.stderr)
                sys.stderr.flush()
            if log is not None:
                val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format(
                    100 * similarity, 100 * accuracy, 100 *
                    validation_coverage) if args.validation is not None else ''
                print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(
                    it, 100 * objective, val, duration),
                      file=log)
                log.flush()

        t = time.time()
        it += 1

    if log is not None:
        log.close()

    # Write mapped embeddings
    with open(args.src_output,
              mode='w',
              encoding=args.encoding,
              errors='surrogateescape') as srcfile, \
            open(args.trg_output,
                 mode='w',
                 encoding=args.encoding,
                 errors='surrogateescape') as trgfile:
        embeddings.write(src_words, xw, srcfile)
        embeddings.write(trg_words, z, trgfile)
def main():
    """Evaluate cross-lingual embeddings on word translation induction.

    Command-line entry point. Reads source and target embeddings, length
    normalizes them, reads a test dictionary of ``source target`` pairs, and
    prints the dictionary coverage together with the nearest-neighbour
    translation accuracy over the covered source words.

    ``--encoding`` takes a value (the original declaration combined a string
    default with ``action='store_true'``, which made the flag set
    ``args.encoding`` to ``True`` and break every ``open`` call).
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description=
        'Evaluate embeddings of two languages in a shared space in word translation induction'
    )
    parser.add_argument('src_embeddings',
                        help='the source language embeddings')
    parser.add_argument('trg_embeddings',
                        help='the target language embeddings')
    parser.add_argument('-d',
                        '--dictionary',
                        default=sys.stdin.fileno(),
                        help='the test dictionary file (defaults to stdin)')
    parser.add_argument(
        '--encoding',
        default='utf-8',
        help='the character encoding for input/output (defaults to utf-8)')
    args = parser.parse_args()

    # Read input embeddings; both handles are released as soon as parsing ends
    with open(args.src_embeddings,
              encoding=args.encoding,
              errors='surrogateescape') as srcfile, \
            open(args.trg_embeddings,
                 encoding=args.encoding,
                 errors='surrogateescape') as trgfile:
        src_words, src_matrix = embeddings.read(srcfile)
        trg_words, trg_matrix = embeddings.read(trgfile)

    # Length normalize embeddings so their dot product effectively computes the cosine similarity
    src_matrix = embeddings.length_normalize(src_matrix)
    trg_matrix = embeddings.length_normalize(trg_matrix)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Read dictionary and compute coverage
    src2trg = collections.defaultdict(set)
    oov = set()
    vocab = set()
    with open(args.dictionary,
              encoding=args.encoding,
              errors='surrogateescape') as f:
        for line in f:
            src, trg = line.split()
            try:
                src_ind = src_word2ind[src]
                trg_ind = trg_word2ind[trg]
                src2trg[src_ind].add(trg_ind)
                vocab.add(src)
            except KeyError:
                oov.add(src)
    oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
    coverage = len(src2trg) / (len(src2trg) + len(oov))

    # Compute accuracy: a source word counts as correct when its single
    # nearest target neighbour is one of its gold translations
    correct = 0
    for src, trg in src2trg.items():
        similarities = np.dot(trg_matrix, src_matrix[src])
        closest = np.argmax(similarities)
        if closest in trg:
            correct += 1
    print('Coverage:{0:7.2%} Accuracy:{1:7.2%}'.format(
        coverage, correct / len(src2trg)))
def main(): # Parse command line arguments parser = argparse.ArgumentParser( description='Map word embeddings in two languages into a shared space') parser.add_argument('src_input', help='the input source embeddings') parser.add_argument('trg_input', help='the input target embeddings') parser.add_argument('sense_input', help='the input sense mapping matrix') parser.add_argument('src_output', help='the output source embeddings') parser.add_argument('trg_output', help='the output target embeddings') parser.add_argument('tsns_output', default='tsns.pkl', help='the output target senses pickle file') parser.add_argument( '--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)') parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32', help='the floating-point precision (defaults to fp32)') parser.add_argument('--cuda', action='store_true', help='use cuda (requires cupy)') parser.add_argument('--seed', type=int, default=0, help='the random seed (defaults to 0)') recommended_group = parser.add_argument_group( 'recommended settings', 'Recommended settings for different scenarios') recommended_type = recommended_group.add_mutually_exclusive_group() recommended_type.add_argument( '--unsupervised', action='store_true', help= 'recommended if you have no seed dictionary and do not want to rely on identical words' ) recommended_type.add_argument('--future', action='store_true', help='experiment with stuff') recommended_type.add_argument('--toy', action='store_true', help='experiment with stuff on toy dataset') recommended_type.add_argument('--acl2018', action='store_true', help='reproduce our ACL 2018 system') init_group = parser.add_argument_group( 'advanced initialization arguments', 'Advanced initialization arguments') init_type = init_group.add_mutually_exclusive_group() init_type.add_argument('--init_unsupervised', action='store_true', help='use unsupervised initialization') init_group.add_argument( 
'--unsupervised_vocab', type=int, default=0, help= 'restrict the vocabulary to the top k entries for unsupervised initialization' ) mapping_group = parser.add_argument_group( 'advanced mapping arguments', 'Advanced embedding mapping arguments') mapping_group.add_argument( '--normalize', choices=['unit', 'center', 'unitdim', 'centeremb', 'none'], nargs='*', default=[], help='the normalization actions to perform in order') mapping_group.add_argument('--whiten', action='store_true', help='whiten the embeddings') mapping_group.add_argument('--src_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the source language embeddings') mapping_group.add_argument('--trg_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the target language embeddings') mapping_group.add_argument('--src_dewhiten', choices=['src', 'trg'], help='de-whiten the source language embeddings') mapping_group.add_argument('--trg_dewhiten', choices=['src', 'trg'], help='de-whiten the target language embeddings') mapping_group.add_argument('--dim_reduction', type=int, default=0, help='apply dimensionality reduction') mapping_type = mapping_group.add_mutually_exclusive_group() mapping_type.add_argument('-c', '--orthogonal', action='store_true', help='use orthogonal constrained mapping') self_learning_group = parser.add_argument_group( 'advanced self-learning arguments', 'Advanced arguments for self-learning') self_learning_group.add_argument( '--vocabulary_cutoff', type=int, default=0, help='restrict the vocabulary to the top k entries') self_learning_group.add_argument( '--threshold', default=0.000001, type=float, help='the convergence threshold (defaults to 0.000001)') self_learning_group.add_argument( '--stochastic_initial', default=0.1, type=float, help= 'initial keep probability stochastic dictionary induction (defaults to 0.1)' ) self_learning_group.add_argument( '--stochastic_multiplier', default=2.0, type=float, help='stochastic dictionary induction multiplier 
(defaults to 2.0)') self_learning_group.add_argument( '--stochastic_interval', default=50, type=int, help='stochastic dictionary induction interval (defaults to 50)') self_learning_group.add_argument( '--log', default='map.log', help='write to a log file in tsv format at each iteration') self_learning_group.add_argument( '-v', '--verbose', action='store_true', help='write log information to stderr at each iteration') future_group = parser.add_argument_group('experimental arguments', 'Experimental arguments') future_group.add_argument('--skip_top', type=int, default=0, help='Top k words to skip, presumably function') future_group.add_argument( '--start_src', action='store_true', help='Algorithm starts by tuning sense embeddings based on source') future_group.add_argument('--trim_senses', action='store_true', help='Trim sense table to working vocab') future_group.add_argument( '--lamb', type=float, default=0.5, help='Weight hyperparameter for sense alignment objectives') future_group.add_argument('--reglamb', type=float, default=1., help='Lasso regularization hyperparameter') future_group.add_argument( '--ccreglamb', type=float, default=0.1, help='Sense embedding regularization hyperparameter') future_group.add_argument('--inv_delta', type=float, default=0.0001, help='Delta_I added for inverting sense matrix') future_group.add_argument('--lasso_iters', type=int, default=10, help='Number of iterations for LASSO/NMF') future_group.add_argument('--iterations', type=int, default=-1, help='Number of overall model iterations') future_group.add_argument('--trg_batch', type=int, default=5000, help='Batch size for target steps') future_group.add_argument( '--trg_knn', action='store_true', help='Perform target sense mapping by k-nearest neighbors') future_group.add_argument( '--trg_sns_csls', type=int, default=10, help='K-nearest neighbors for CSLS target sense search') future_group.add_argument( '--senses_per_trg', type=int, default=1, help='K-max target sense mapping 
(default = 1 = off)') future_group.add_argument( '--gd', action='store_true', help='Apply gradient descent for assignment and synset embeddings') future_group.add_argument('--gd_lr', type=float, default=1e-2, help='Learning rate for SGD (default=0.01)') future_group.add_argument('--gd_wd', action='store_true', help='Weight decay in SGD') future_group.add_argument( '--gd_wd_hl', type=int, default=100, help='Weight decay half-life in SGD, default=100') future_group.add_argument( '--gd_clip', type=float, default=5., help='Per-coordinate gradient clipping (default=5)') future_group.add_argument( '--gd_map_steps', type=int, default=1, help='Consecutive steps for each target-sense mapping update phase') future_group.add_argument( '--gd_emb_steps', type=int, default=1, help='Consecutive steps for each sense embedding update phase') future_group.add_argument( '--base_prox_lambda', type=float, default=0.99, help='Lambda for proximal gradient in lasso step') future_group.add_argument( '--prox_decay', action='store_true', help='Multiply proximal lambda by itself each iteration') future_group.add_argument( '--sense_limit', type=float, default=1.1, help= 'Maximum amount of target sense mappings, in terms of source mappings (default=1.1x)' ) future_group.add_argument( '--gold_pairs', help='Gold data for evaluation, if exists (not for tuning)') future_group.add_argument( '--gold_threshold', type=float, default=0.0, help='Threshold for gold mapping (0 is fine if sparse)') future_group.add_argument('--debug', action='store_true') args = parser.parse_args() # pre-setting groups if args.toy: parser.set_defaults(init_unsupervised=True, unsupervised_vocab=4000, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', vocabulary_cutoff=50, trim_senses=True, inv_delta=1., reglamb=0.2, lasso_iters=100, gd_wd=True, log='map-toy.log') if args.unsupervised or args.future: parser.set_defaults(init_unsupervised=True, 
unsupervised_vocab=4000, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', vocabulary_cutoff=2000, trim_senses=True, gd_wd=True) if args.unsupervised or args.acl2018: parser.set_defaults(init_unsupervised=True, unsupervised_vocab=4000, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', vocabulary_cutoff=20000) args = parser.parse_args() # Check command line arguments if (args.src_dewhiten is not None or args.trg_dewhiten is not None) and not args.whiten: print('ERROR: De-whitening requires whitening first', file=sys.stderr) sys.exit(-1) # Choose the right dtype for the desired precision if args.precision == 'fp16': dtype = 'float16' # many operations not supported by cupy elif args.precision == 'fp32': # default dtype = 'float32' elif args.precision == 'fp64': dtype = 'float64' # Read input embeddings print('reading embeddings...') srcfile = open(args.src_input, encoding=args.encoding, errors='surrogateescape') trgfile = open(args.trg_input, encoding=args.encoding, errors='surrogateescape') src_words, x = embeddings.read(srcfile, dtype=dtype) trg_words, z = embeddings.read(trgfile, dtype=dtype) print('embeddings read') # Read input source sense mapping print('reading sense mapping') src_senses = pickle.load(open(args.sense_input, 'rb')) if src_senses.shape[0] != x.shape[0]: src_senses = csr_matrix(src_senses.transpose() ) # using non-cuda scipy because of 'inv' impl #src_senses = get_sparse_module(src_senses) print( f'source sense mapping of shape {src_senses.shape} loaded with {src_senses.getnnz()} nonzeros' ) # NumPy/CuPy management if args.cuda: if not supports_cupy(): print('ERROR: Install CuPy for CUDA support', file=sys.stderr) sys.exit(-1) xp = get_cupy() x = xp.asarray(x) z = xp.asarray(z) print('CUDA loaded') else: xp = np xp.random.seed(args.seed) # removed word to index map (only relevant in supervised 
learning or with validation) # STEP 0: Normalization embeddings.normalize(x, args.normalize) embeddings.normalize(z, args.normalize) print('normalization complete') # removed building the seed dictionary # removed validation step # Create log file if args.log: log = open(args.log, mode='w', encoding=args.encoding, errors='surrogateescape') print(f'logging into {args.log}') # Allocate memory # Initialize the projection matrices W(s) = W(t) = I. xw = xp.empty_like(x) zw = xp.empty_like(z) xw[:] = x zw[:] = z src_size = x.shape[0] if args.vocabulary_cutoff <= 0 else min( x.shape[0] - args.skip_top, args.vocabulary_cutoff) trg_size = z.shape[0] if args.vocabulary_cutoff <= 0 else min( z.shape[0] - args.skip_top, args.vocabulary_cutoff) emb_dim = x.shape[1] cutoff_end = min(src_size + args.skip_top, x.shape[0]) if args.trim_senses: # reshape sense assignment src_senses = src_senses[args.skip_top:cutoff_end] # new columns for words with no senses in original input ### TODO might also need this if not trimming (probably kinda far away) newcols = [csc_matrix(([1],([i],[0])),shape=(src_size,1)) for i in range(src_size)\ if src_senses.getrow(i).getnnz() == 0] #with open(f'data/synsets/dummy_synsets_v3b_{src_size}','wb') as dummy_cols_file: # dummy_col_idcs = [i for i in range(src_size) if src_senses.getrow(i).getnnz() == 0] # pickle.dump(np.array(dummy_col_idcs), dummy_cols_file) # trim senses no longer used, add new ones colsums = src_senses.sum(axis=0).tolist()[0] kept_senses = [i for i, j in enumerate(colsums) if j > 0] #with open(f'data/synsets/kept_synsets_v3b_{src_size}','wb') as kept_save_file: # pickle.dump(np.array(kept_senses), kept_save_file) src_senses = hstack([src_senses[:, kept_senses]] + newcols) print( f'trimmed sense dictionary dimensions: {src_senses.shape} with {src_senses.getnnz()} nonzeros' ) sense_size = src_senses.shape[1] if args.gold_pairs is not None: with open(args.gold_pairs, 'rb') as gold_pairs_f: gold_pairs = pickle.load(gold_pairs_f) 
gold_pairs = [(i-args.skip_top,j) for i,j in gold_pairs \ if i >= args.skip_top and i < src_senses.shape[0] and j < src_senses.shape[1]] gold_trgs = sorted(set([x[0] for x in gold_pairs])) gold_senses = sorted(set([x[1] for x in gold_pairs])) gold_domain_size = len(gold_trgs) * len(gold_senses) print( f'evaluating on {len(gold_pairs)} pairs with {len(gold_trgs)} unique words and {len(gold_senses)} unique senses' ) # Initialize the concept embeddings from the source embeddings ### TODO maybe try gradient descent instead? ### TODO (pre-)create non-singular alignment matrix cc = xp.empty((sense_size, emb_dim), dtype=dtype) # \tilde{E} t01 = time.time() print('starting psinv calc') src_sns_psinv = psinv(src_senses, dtype, args.inv_delta) xecc = x[args.skip_top:cutoff_end].T.dot( get_sparse_module(src_senses).toarray()).T # sense_size * emb_dim cc[:] = src_sns_psinv.dot(xecc) print(f'initialized concept embeddings in {time.time()-t01:.2f} seconds', file=sys.stderr) if args.verbose: # report precision of psedo-inverse operation, checked by inverting pseudo_id = src_senses.transpose().dot(src_senses).dot( src_sns_psinv.get()) real_id = sparse_id(sense_size) rel_diff = (pseudo_id - real_id).sum() / (sense_size * sense_size) print(f'per-coordinate pseudo-inverse precision is {rel_diff:.5f}') ### TODO initialize trg_senses using seed dictionary instead? 
trg_sns_size = trg_size if args.trim_senses else z.shape[0] trg_senses = csr_matrix( (trg_sns_size, sense_size)) # using non-cuda scipy because of 'inv' impl zecc = xp.empty_like(xecc) # sense_size * emb_dim #tg_grad = xp.empty((trg_sns_size, sense_size)) if args.gd: # everything can be done on gpu src_senses = get_sparse_module(src_senses, dtype=dtype) trg_senses = get_sparse_module(trg_senses, dtype=dtype) if args.sense_limit > 0.0: trg_sense_limit = int(args.sense_limit * src_senses.getnnz()) if args.verbose: print( f'limiting target side to {trg_sense_limit} sense mappings' ) else: trg_sense_limit = -1 ### TODO return memory assignment for similarities? # Training loop if args.gd: prox_lambda = args.base_prox_lambda else: lasso_model = Lasso(alpha=args.reglamb, fit_intercept=False, max_iter=args.lasso_iters,\ positive=True, warm_start=True) # TODO more parametrization if args.log is not None: if args.gd: print(f'gradient descent lr: {args.gd_lr}', file=log) print(f'base proximal lambda: {args.base_prox_lambda}', file=log) else: print(f'lasso regularization: {args.reglamb}', file=log) print(f'lasso iterations: {args.lasso_iters}', file=log) print(f'inversion epsilon: {args.inv_delta}', file=log) if args.gold_pairs is not None: print(f'gold mappings: {len(gold_pairs)}', file=log) print( f'Iteration\tObjective\tSource\tTarget\tL_1\tDuration\tNonzeros\tCorrect_mappings', file=log) log.flush() best_objective = objective = 1000000000. 
correct_mappings = -1 regularization_lambda = args.base_prox_lambda if args.gd else args.reglamb it = 1 last_improvement = 0 t = time.time() map_gd_lr = args.gd_lr emb_gd_lr = args.gd_lr end = False print('starting training') if args.start_src: print('starting with converging synset embeddings') it_range = range( args.iterations ) ### TODO possibly add arg, but there's early stopping if not args.verbose: it_range = tqdm(it_range) prev_obj = float('inf') for pre_it in it_range: if args.gd_wd: emb_gd_lr = args.gd_lr * pow(0.5, floor( pre_it / args.gd_wd_hl)) # Synset embedding cc_grad = src_senses.T.dot( xw[args.skip_top:cutoff_end] - src_senses.dot(cc)) - args.ccreglamb * cc cc_grad.clip(-args.gd_clip, args.gd_clip, out=cc_grad) cc += emb_gd_lr * cc_grad # Source projection u, s, vt = xp.linalg.svd(cc.T.dot(xecc)) wx = vt.T.dot(u.T).astype(dtype) x.dot(wx, out=xw) pre_objective = ((xp.linalg.norm( xw[args.skip_top:cutoff_end] - get_sparse_module(src_senses).dot(cc), 'fro'))**2) / 2 pre_objective = float(pre_objective) if args.verbose and pre_it > 0 and pre_it % 10 == 0: print( f'source synset embedding objective iteration {pre_it}: {pre_objective:.3f}' ) if pre_objective > prev_obj: print( f'stopping at pre-iteration {pre_it}, source-sense objective {prev_obj:.3f}' ) # revert cc -= emb_gd_lr * cc_grad break prev_obj = pre_objective while True: if it % 50 == 0: print( f'starting iteration {it}, last objective was {objective}, correct mappings at {correct_mappings}' ) # Increase the keep probability if we have not improved in args.stochastic_interval iterations if it - last_improvement > args.stochastic_interval: last_improvement = it if args.iterations > 0 and it > args.iterations: end = True ### update target assignments (6) - lasso-esque regression time6 = time.time() # optimize: 0.5 * (xp.linalg.norm(zw[i] - trg_senses[i].dot(cc))^2) + (regularization_lambda * xp.linalg.norm(trg_senses[i],1)) if args.trg_knn: # for csls-based neighborhoods knn_sense = 
xp.full(sense_size, -100) for i in range(0, sense_size, args.trg_batch): batch_end = min(i + args.trg_batch, sense_size) sim_sense_trg = cc[i:batch_end].dot( zw[args.skip_top:cutoff_end].T) knn_sense[i:batch_end] = topk_mean(sim_sense_trg, k=args.trg_sns_csls, inplace=True) # calculate new target mappings trg_senses = lil_matrix(trg_senses.shape) for i in range(0, trg_size, args.trg_batch): sns_batch_end = min(i + args.trg_batch, trg_size) z_i = i + args.skip_top z_batch_end = min(sns_batch_end + args.skip_top, zw.shape[0]) sims = zw[z_i:z_batch_end].dot(cc.T) sims -= knn_sense / 2 # equivalent to the real CSLS scores for NN best_idcs = sims.argmax(1).tolist() trg_senses[(list(range(i, sns_batch_end)), best_idcs)] = sims.max(1).tolist() # second-to-lth-best for l in range(args.senses_per_trg - 1): sims[(list(range(sims.shape[0])), best_idcs)] = 0. best_idcs = sims.argmax(1).tolist() trg_senses[(list(range(i, sns_batch_end)), best_idcs)] = sims.max(1).tolist() trg_senses = get_sparse_module(trg_senses.tocsr()) elif args.gd: ### TODO add args.skip_top calculations if args.gd_wd: true_it = (it - 1) * args.gd_map_steps map_gd_lr = args.gd_lr * pow( 0.5, floor((1 + true_it) / args.gd_wd_hl)) if args.verbose: print(f'mapping learning rate: {map_gd_lr}') for k in range(args.gd_map_steps): # st <- st + eta * (ew - st.dot(es)).dot(es.T) # allow up to sense_limit updates, clip gradient batch_grads = [] for i in range(0, trg_size, args.trg_batch): batch_end = min(i + args.trg_batch, trg_size) tg_grad_b = (zw[i:batch_end] - trg_senses[i:batch_end].dot(cc)).dot(cc.T) # proximal gradient tg_grad_b += prox_lambda tg_grad_b.clip(None, 0.0, out=tg_grad_b) batch_grads.append(batch_sparse(tg_grad_b)) tg_grad = get_sparse_module(vstack(batch_grads)) del tg_grad_b if args.prox_decay: prox_lambda *= args.base_prox_lambda ### TODO consider weight decay here as well (args.gd_wd) trg_senses -= map_gd_lr * tg_grad # allow up to sense_limit nonzeros if trg_sense_limit > 0: trg_senses = 
trim_sparse(trg_senses, trg_sense_limit, clip=None) ### TODO consider finishing up with lasso (maybe only in final iteration) else: ### TODO add args.skip_top calculations # parallel LASSO (no cuda impl) cccpu = cc.get().T # emb_dim * sense_size lasso_model.fit(cccpu, zw[:trg_size].get().T) ### TODO maybe trim, keep only above some threshold (0.05) OR top f(#it) trg_senses = lasso_model.sparse_coef_ if args.verbose: print( f'target sense mapping step: {(time.time()-time6):.2f} seconds, {trg_senses.getnnz()} nonzeros', file=sys.stderr) objective = ((xp.linalg.norm(xw[args.skip_top:cutoff_end] - get_sparse_module(src_senses).dot(cc),'fro') ** 2)\ + (xp.linalg.norm(zw[args.skip_top:cutoff_end] - get_sparse_module(trg_senses).dot(cc),'fro')) ** 2) / 2 \ + regularization_lambda * trg_senses.sum() # TODO consider thresholding reg part objective = float(objective) print(f'objective: {objective:.3f}') # Write target sense mapping with open(f'tmp_outs/{args.tsns_output[:-4]}-it{it:03d}.pkl', mode='wb') as tsnsfile: pickle.dump(trg_senses.get(), tsnsfile) ### update synset embeddings (10) time10 = time.time() if args.gd and args.gd_emb_steps > 0: ### TODO probably handle sizes and/or threshold sparse matrix if args.gd_wd: true_it = (it - 1) * args.gd_emb_steps emb_gd_lr = args.gd_lr * pow( 0.5, floor((1 + true_it) / args.gd_wd_hl)) if args.verbose: print(f'embedding learning rate: {emb_gd_lr}') ### replace block for no-source-tuning mode all_senses = trg_senses if args.start_src else get_sparse_module( vstack((src_senses.get(), trg_senses.get()), format='csr'), dtype=dtype) aw = zw[args. 
skip_top:cutoff_end] if args.start_src else xp.concatenate( (xw[args.skip_top:cutoff_end], zw[args.skip_top:cutoff_end])) for i in range(args.gd_emb_steps): cc_grad = all_senses.T.dot( aw - all_senses.dot(cc)) - args.ccreglamb * cc cc_grad.clip(-args.gd_clip, args.gd_clip, out=cc_grad) cc += emb_gd_lr * cc_grad else: ### TODO add args.skip_top calculations all_senses = get_sparse_module( vstack((src_senses, trg_senses), format='csr')) xzecc = xp.concatenate((xw[:src_size], zw[:trg_size])).T\ .dot(all_senses.toarray()).T # sense_size * emb_dim all_sns_psinv = psinv( all_senses.get(), dtype, args.inv_delta ) ### TODO only update target side? We still have src_sns_psinv [it doesn't matter, dimensions are the same] cc[:] = all_sns_psinv.dot(xzecc) if args.verbose: print(f'synset embedding update: {time.time()-time10:.2f}', file=sys.stderr) objective = ((xp.linalg.norm(xw[args.skip_top:cutoff_end] - get_sparse_module(src_senses).dot(cc),'fro')) ** 2\ + (xp.linalg.norm(zw[args.skip_top:cutoff_end] - get_sparse_module(trg_senses).dot(cc),'fro')) ** 2) / 2 \ + regularization_lambda * trg_senses.sum() # TODO consider thresholding reg part objective = float(objective) print(f'objective: {objective:.3f}') ### update projections (3,5) # write to zw and xw if args.orthogonal or not end: ### remove block for no-source-tuning mode # source side - mappings don't change so xecc is constant #if not args.start_src: # need to do this anyway whenever cc updates time3 = time.time() u, s, vt = xp.linalg.svd(cc.T.dot(xecc)) wx = vt.T.dot(u.T).astype(dtype) x.dot(wx, out=xw) if args.verbose: print(f'source projection update: {time.time()-time3:.2f}', file=sys.stderr) # target side - compute sense mapping first time3 = time.time() zecc.fill(0.) 
for i in range(0, trg_size, args.trg_batch): end_idx = min(i + args.trg_batch, trg_size) zecc += z[i:end_idx].T.dot( get_sparse_module(trg_senses[i:end_idx]).toarray()).T u, s, vt = xp.linalg.svd(cc.T.dot(zecc)) wz = vt.T.dot(u.T).astype(dtype) z.dot(wz, out=zw) if args.verbose: print(f'target projection update: {time.time()-time3:.2f}', file=sys.stderr) ### TODO add parts from 'advanced mapping' part - transformations, whitening, etc. # Objective function evaluation time_obj = time.time() trg_senses_l1 = float(trg_senses.sum()) src_obj = (float( xp.linalg.norm( xw[args.skip_top:cutoff_end] - get_sparse_module(src_senses).dot(cc), 'fro'))**2) / 2 trg_obj = (float( xp.linalg.norm( zw[args.skip_top:cutoff_end] - get_sparse_module(trg_senses).dot(cc), 'fro'))**2) / 2 objective = src_obj + trg_obj + regularization_lambda * trg_senses_l1 # TODO consider thresholding reg part if args.verbose: print(f'objective calculation: {time.time()-time_obj:.2f}', file=sys.stderr) if objective - best_objective <= -args.threshold: last_improvement = it best_objective = objective # WordNet transduction evaluation (can't tune on this) if args.gold_pairs is not None: np_trg_senses = trg_senses.get() trg_corr = [ p for p in gold_pairs if np_trg_senses[p] > args.gold_threshold ] correct_mappings = len(trg_corr) domain_trgs = np_trg_senses[gold_trgs][:, gold_senses] else: correct_mappings = -1 # Logging duration = time.time() - t if args.verbose: print('ITERATION {0} ({1:.2f}s)'.format(it, duration), file=sys.stderr) print('objective: {0:.3f}'.format(objective), file=sys.stderr) print('target senses l_1 norm: {0:.3f}'.format(trg_senses_l1), file=sys.stderr) if len(gold_pairs) > 0 and domain_trgs.getnnz() > 0: print( f'{correct_mappings} correct target mappings: {(correct_mappings/len(gold_pairs)):.3f} recall, {(correct_mappings/domain_trgs.getnnz()):.3f} precision', file=sys.stderr) print(file=sys.stderr) sys.stderr.flush() if args.log is not None: print( 
f'{it}\t{objective:.3f}\t{src_obj:.3f}\t{trg_obj:.3f}\t{trg_senses_l1:.3f}\t{duration:.3f}\t{trg_senses.getnnz()}\t{correct_mappings}', file=log) log.flush() if end: break t = time.time() it += 1 # Write mapped embeddings with open(args.src_output, mode='w', encoding=args.encoding, errors='surrogateescape') as srcfile: embeddings.write(src_words, xw, srcfile) with open(args.trg_output, mode='w', encoding=args.encoding, errors='surrogateescape') as trgfile: embeddings.write(trg_words, zw, trgfile) # Write target sense mapping with open(args.tsns_output, mode='wb') as tsnsfile: pickle.dump(trg_senses.get(), tsnsfile)
def main():
    """Induce a bilingual dictionary from two embedding sets in a shared space.

    Reads source and target embeddings, collects the source words listed in
    the first column of the test dictionary that are present in the source
    vocabulary, retrieves one target word per source word using the selected
    retrieval method (``nn``, ``invnn``, ``invsoftmax`` or ``csls``) and writes
    the induced ``source<TAB>target`` pairs to the output dictionary.

    All options are taken from the command line; nothing is returned. The
    process exits with status -1 when ``--cuda`` is requested but CuPy is not
    available.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Evaluate embeddings of two languages in a shared space in word translation induction')
    parser.add_argument('src_embeddings', help='the source language embeddings')
    parser.add_argument('trg_embeddings', help='the target language embeddings')
    parser.add_argument('-d', '--dictionary', default=sys.stdin.fileno(),
                        help='the test dictionary file (defaults to stdin)')
    # The output is opened for writing, so its default must be stdout; the
    # previous default (the stdin descriptor) cannot be written to when the
    # dictionary is piped in.
    parser.add_argument('-o', '--output-dictionary', default=sys.stdout.fileno(),
                        help='path to the output dictionary (defaults to stdout)')
    parser.add_argument(
        '--retrieval', default='nn', choices=['nn', 'invnn', 'invsoftmax', 'csls'],
        help='the retrieval method (nn: standard nearest neighbor; invnn: inverted nearest neighbor; invsoftmax: inverted softmax; csls: cross-domain similarity local scaling)')
    parser.add_argument('--inv_temperature', default=1, type=float,
                        help='the inverse temperature (only compatible with inverted softmax)')
    parser.add_argument(
        '--inv_sample', default=None, type=int,
        help='use a random subset of the source vocabulary for the inverse computations (only compatible with inverted softmax)')
    parser.add_argument('-k', '--neighborhood', default=10, type=int,
                        help='the neighborhood size (only compatible with csls)')
    parser.add_argument('--dot', action='store_true',
                        help='use the dot product in the similarity computations instead of the cosine')
    parser.add_argument('--encoding', default='utf-8',
                        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--seed', type=int, default=0, help='the random seed')
    parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32',
                        help='the floating-point precision (defaults to fp32)')
    parser.add_argument('--cuda', action='store_true', help='use cuda (requires cupy)')
    args = parser.parse_args()

    # Choose the right dtype for the desired precision (argparse `choices`
    # guarantees that the key exists)
    dtype = {'fp16': 'float16', 'fp32': 'float32', 'fp64': 'float64'}[args.precision]

    # Read input embeddings; the files are closed as soon as they are parsed
    with open(args.src_embeddings, encoding=args.encoding, errors='surrogateescape') as srcfile, \
            open(args.trg_embeddings, encoding=args.encoding, errors='surrogateescape') as trgfile:
        src_words, x = embeddings.read(srcfile, dtype=dtype)
        trg_words, z = embeddings.read(trgfile, dtype=dtype)

    # NumPy/CuPy management
    if args.cuda:
        if not supports_cupy():
            print('ERROR: Install CuPy for CUDA support', file=sys.stderr)
            sys.exit(-1)
        xp = get_cupy()
        x = xp.asarray(x)
        z = xp.asarray(z)
    else:
        xp = np
    xp.random.seed(args.seed)

    # Length normalize embeddings so their dot product effectively computes the cosine similarity
    # NOTE(review): the return value is discarded, so this relies on
    # embeddings.length_normalize working in place - confirm.
    if not args.dot:
        embeddings.length_normalize(x)
        embeddings.length_normalize(z)

    # Build word <-> index maps
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    src_ind2word = {i: word for i, word in enumerate(src_words)}
    trg_ind2word = {i: word for i, word in enumerate(trg_words)}

    # Read the dictionary and keep the in-vocabulary source words
    # (only the first column of each line is used)
    src = set()
    with open(args.dictionary, encoding=args.encoding, errors='surrogateescape') as f:
        for line in f:
            src_word = line.split()[0]
            src_ind = src_word2ind.get(src_word, None)
            if src_ind is not None:
                src.add(src_ind)
    src = list(src)

    # Find translations
    translation = collections.defaultdict(int)
    if args.retrieval == 'nn':  # Standard nearest neighbor
        for i in range(0, len(src), BATCH_SIZE):
            j = min(i + BATCH_SIZE, len(src))
            similarities = x[src[i:j]].dot(z.T)
            nn = similarities.argmax(axis=1).tolist()
            for k in range(j - i):
                translation[src[i + k]] = nn[k]
    elif args.retrieval == 'invnn':  # Inverted nearest neighbor
        best_rank = np.full(len(src), x.shape[0], dtype=int)
        best_sim = np.full(len(src), -100, dtype=dtype)
        for i in range(0, z.shape[0], BATCH_SIZE):
            j = min(i + BATCH_SIZE, z.shape[0])
            similarities = z[i:j].dot(x.T)
            ind = (-similarities).argsort(axis=1)
            ranks = asnumpy(ind.argsort(axis=1)[:, src])
            sims = asnumpy(similarities[:, src])
            for k in range(i, j):
                for l in range(len(src)):
                    rank = ranks[k - i, l]
                    sim = sims[k - i, l]
                    # Prefer the target word that ranks this source word
                    # highest; break ties by similarity
                    if rank < best_rank[l] or (rank == best_rank[l] and sim > best_sim[l]):
                        best_rank[l] = rank
                        best_sim[l] = sim
                        translation[src[l]] = k
    elif args.retrieval == 'invsoftmax':  # Inverted softmax
        sample = xp.arange(x.shape[0]) if args.inv_sample is None \
            else xp.random.randint(0, x.shape[0], args.inv_sample)
        partition = xp.zeros(z.shape[0])
        for i in range(0, len(sample), BATCH_SIZE):
            j = min(i + BATCH_SIZE, len(sample))
            partition += xp.exp(args.inv_temperature * z.dot(x[sample[i:j]].T)).sum(axis=1)
        for i in range(0, len(src), BATCH_SIZE):
            j = min(i + BATCH_SIZE, len(src))
            p = xp.exp(args.inv_temperature * x[src[i:j]].dot(z.T)) / partition
            nn = p.argmax(axis=1).tolist()
            for k in range(j - i):
                translation[src[i + k]] = nn[k]
    elif args.retrieval == 'csls':  # Cross-domain similarity local scaling
        knn_sim_bwd = xp.zeros(z.shape[0])
        for i in range(0, z.shape[0], BATCH_SIZE):
            j = min(i + BATCH_SIZE, z.shape[0])
            knn_sim_bwd[i:j] = topk_mean(z[i:j].dot(x.T), k=args.neighborhood, inplace=True)
        for i in range(0, len(src), BATCH_SIZE):
            j = min(i + BATCH_SIZE, len(src))
            similarities = 2 * x[src[i:j]].dot(z.T) - knn_sim_bwd  # Equivalent to the real CSLS scores for NN
            nn = similarities.argmax(axis=1).tolist()
            for k in range(j - i):
                translation[src[i + k]] = nn[k]

    # Write the induced dictionary. Words were decoded with surrogateescape,
    # so the same error handler (and the same encoding) is needed to write
    # them back. When writing to the inherited stdout descriptor, leave the
    # descriptor itself open on exit.
    out_target = args.output_dictionary
    with open(out_target, 'w', encoding=args.encoding, errors='surrogateescape',
              closefd=not isinstance(out_target, int)) as fout:
        for src_ind in src:
            trg_ind = translation[src_ind]
            src_word = src_ind2word[src_ind]
            trg_word = trg_ind2word[trg_ind]
            fout.write('\t'.join([src_word, trg_word]) + '\n')
# Calls ``embeddings.read`` and discards its result; nothing is returned.
# NOTE(review): the body refers to `in_embfile` and `max_voc`, neither of
# which is a parameter or a local of this function, while the declared
# parameters `in_embfname` and `filter_func` are never used. Unless those two
# names exist as module-level globals, calling this raises NameError. The
# body looks truncated (presumably it should open `in_embfname` and apply
# `filter_func` to what is read) - TODO confirm against the full file.
def filter_embeddings(in_embfname, filter_func): embeddings.read(in_embfile, max_voc=max_voc)
def main():
    """Learn a mapping from source to target embeddings by iterative matching.

    Reads both embedding files, seeds a match set from the optional
    supervised dictionary plus all words spelled identically in both
    vocabularies, and then alternates for ``--maxiter`` iterations between
    (a) fitting the mapping ``w`` on the current matches (weighted least
    squares with ``--unconstrained``, otherwise an SVD-based solution) and
    (b) inducing new matches with ``find_matches``. After each iteration the
    running dictionary scores are dumped to ``OUTPUTDIR/<dictname>.<it>``.
    Finally the embeddings are re-read, and the mapped source embeddings and
    the (normalized) target embeddings are written out.

    All options are taken from the command line; nothing is returned. The
    process exits with status -1 when ``--cuda`` is requested but CuPy is not
    available.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Map word embeddings in two languages into a shared space')
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('src_output', help='the output source embeddings')
    parser.add_argument('trg_output', help='the output target embeddings')
    parser.add_argument('--encoding', default='utf-8',
                        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32',
                        help='the floating-point precision (defaults to fp32)')
    parser.add_argument('--cuda', action='store_true', help='use cuda (requires cupy)')
    parser.add_argument(
        '--batch_size', default=10000, type=int,
        help='batch size (defaults to 10000); does not affect results, larger is usually faster but uses more memory')
    parser.add_argument('--seed', type=int, default=0, help='the random seed (defaults to 0)')
    parser.add_argument('--maxiter', type=int, default=10, help='max number of iterations')
    parser.add_argument('--corekbest', type=int, default=2,
                        help='nn ranking to be considered as a match')
    parser.add_argument('--decayrate', type=float, default=1.01, help='for boosting')
    parser.add_argument('--init_vocab', type=int, default=10000, help='for boosting')
    parser.add_argument('--dictname', default='dict.tmp', help='output the dictionary')

    recommended_type = parser.add_argument_group(
        'recommended settings', 'Recommended settings for different scenarios')
    recommended_type.add_argument('--supervised', metavar='DICTIONARY',
                                  help='recommended if you have a large training dictionary')
    recommended_type.add_argument(
        '--identical', default=True,
        help='recommended if you have no seed dictionary but can rely on identical words')

    init_group = parser.add_argument_group(
        'advanced initialization arguments', 'Advanced initialization arguments')
    init_type = init_group.add_mutually_exclusive_group()
    init_type.add_argument('-d', '--init_dictionary', default=sys.stdin.fileno(),
                           metavar='DICTIONARY',
                           help='the training dictionary file (defaults to stdin)')
    init_type.add_argument('--init_identical', action='store_true',
                           help='use identical words as the seed dictionary')
    init_type.add_argument(
        '--init_numerals', action='store_true',
        help='use latin numerals (i.e. words matching [0-9]+) as the seed dictionary')
    init_type.add_argument('--init_unsupervised', action='store_true',
                           help='use unsupervised initialization')
    init_group.add_argument(
        '--unsupervised_vocab', type=int, default=0,
        help='restrict the vocabulary to the top k entries for unsupervised initialization')

    mapping_group = parser.add_argument_group(
        'advanced mapping arguments', 'Advanced embedding mapping arguments')
    mapping_group.add_argument(
        '--normalize', choices=['unit', 'center', 'unitdim', 'centeremb', 'none'],
        nargs='*', default=[], help='the normalization actions to perform in order')
    mapping_group.add_argument('--vocabulary', help='restrict source vocab')
    mapping_type = mapping_group.add_mutually_exclusive_group()
    mapping_type.add_argument('-c', '--orthogonal', action='store_true',
                              help='use orthogonal constrained mapping')
    mapping_type.add_argument('-u', '--unconstrained', action='store_true',
                              help='use unconstrained mapping')

    self_learning_group = parser.add_argument_group(
        'advanced self-learning arguments', 'Advanced arguments for self-learning')
    self_learning_group.add_argument('--vocabulary_cutoff', type=int, default=0,
                                     help='restrict the vocabulary to the top k entries')
    self_learning_group.add_argument('--csls', type=int, nargs='?', default=0, const=10,
                                     metavar='NEIGHBORHOOD_SIZE', dest='csls_neighborhood',
                                     help='use CSLS for dictionary induction')
    self_learning_group.add_argument('--validation', default=None, metavar='DICTIONARY',
                                     help='a dictionary file for validation at each iteration')
    self_learning_group.add_argument(
        '--log', help='write to a log file in tsv format at each iteration')
    self_learning_group.add_argument(
        '-v', '--verbose', action='store_true',
        help='write log information to stderr at each iteration')

    # First pass discovers --supervised; the second pass re-parses with the
    # defaults derived from it (explicit command line values still win).
    args = parser.parse_args()
    parser.set_defaults(init_dictionary=args.supervised, normalize=['unit', 'center', 'unit'])
    args = parser.parse_args()
    print(args, file=sys.stderr)

    # Choose the right dtype for the desired precision (argparse `choices`
    # guarantees that the key exists)
    dtype = {'fp16': 'float16', 'fp32': 'float32', 'fp64': 'float64'}[args.precision]

    os.makedirs(OUTPUTDIR, exist_ok=True)

    # Optional whitelist restricting the source vocabulary
    vocabulary = None
    if args.vocabulary is not None:
        vocabulary = set()
        with open(args.vocabulary, encoding=args.encoding, errors='surrogateescape') as file:
            for l in file:
                vocabulary.add(l.split()[0])
        print(f'vocab size:\t{len(vocabulary)}')

    # Read input embeddings
    with open(args.src_input, encoding=args.encoding, errors='surrogateescape') as srcfile, \
            open(args.trg_input, encoding=args.encoding, errors='surrogateescape') as trgfile:
        src_words, x = embeddings.read(srcfile, dtype=dtype,
                                       threshold=args.vocabulary_cutoff, vocabulary=vocabulary)
        trg_words, z = embeddings.read(trgfile, dtype=dtype, threshold=args.vocabulary_cutoff)
    embeddings.normalize(x, args.normalize)
    embeddings.normalize(z, args.normalize)

    # NumPy/CuPy management
    if args.cuda:
        if not supports_cupy():
            print('ERROR: Install CuPy for CUDA support', file=sys.stderr)
            sys.exit(-1)
        xp = get_cupy()
        x = xp.asarray(x)
        z = xp.asarray(z)
    else:
        xp = np
    xp.random.seed(args.seed)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Build the seed dictionary
    src_indices = []
    trg_indices = []
    if args.supervised:
        with open(args.init_dictionary, encoding=args.encoding, errors='surrogateescape') as f:
            for line in f:
                try:
                    src, trg = line.split()[:2]
                except ValueError:  # fewer than two columns: skip the line
                    continue
                try:
                    src_ind = src_word2ind[src]
                    trg_ind = trg_word2ind[trg]
                    src_indices.append(src_ind)
                    trg_indices.append(trg_ind)
                except KeyError:
                    print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg),
                          file=sys.stderr)

    # Read validation dictionary
    if args.validation is not None:
        print('reading validation', file=sys.stderr)
        validation = collections.defaultdict(set)
        oov = set()
        vocab = set()
        with open(args.validation, encoding=args.encoding, errors='surrogateescape') as f:
            for line in f:
                try:
                    src, trg = line.split()
                except ValueError:  # not exactly two columns: skip the line
                    continue
                try:
                    src_ind = src_word2ind[src]
                    trg_ind = trg_word2ind[trg]
                    validation[src_ind].add(trg_ind)
                    vocab.add(src)
                except KeyError:
                    oov.add(src)
        oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
        validation_coverage = len(validation) / (len(validation) + len(oov))

    # Create log file. `log` stays None when no usable path was given, and the
    # same variable is tested when logging, so the two checks cannot disagree.
    log = None
    if args.log:
        log = open(args.log, mode='w', encoding=args.encoding, errors='surrogateescape')

    # Allocate memory
    xw = xp.empty_like(x)
    zw = xp.empty_like(z)

    # Seed matches: supervised pairs plus words spelled identically in both
    # vocabularies (added unconditionally, independent of --identical)
    matches = collections.Counter()
    decided = collections.Counter()
    for p in zip(src_indices, trg_indices):
        matches[p] = 1
        decided[p] = 1
    identical = set(src_words).intersection(set(trg_words))
    for word in identical:
        p = (src_word2ind[word], trg_word2ind[word])
        matches[p] = 1
        decided[p] = 1

    if args.validation is not None:
        simval = xp.empty((len(validation.keys()), z.shape[0]), dtype=dtype)

    # Training loop
    it = 1
    t = time.time()
    wprev = 0
    current_vocab = args.init_vocab
    Stats = collections.namedtuple(
        'MatchStats', ['w_dot', 'mean_dot', 'delta_w', 'current_vocab', 'len_match'])
    pstats = None
    while True:
        src_indices, trg_indices, weights = flatten_match(matches, matches)
        embeddings.noise(x)
        embeddings.noise(z)

        # Fit the mapping on the current matches
        if args.unconstrained:
            w = np.linalg.lstsq(np.sqrt(weights) * x[src_indices],
                                np.sqrt(weights) * z[trg_indices], rcond=None)[0]
            x.dot(w, out=xw)
            zw = z[:]
        else:
            u, s, vt = xp.linalg.svd((weights * z[trg_indices]).T.dot(x[src_indices]))
            w = vt.T.dot(u.T)
            x.dot(w, out=xw)
            zw = z[:]

        w_dot = np.sum(weights * z[trg_indices] * xw[src_indices]) / weights.sum()
        mean_dot = np.sum(z[trg_indices] * xw[src_indices]) / len(src_indices)
        delta_w = np.linalg.norm(w - wprev)
        stats = Stats(w_dot=w_dot, mean_dot=mean_dot, delta_w=delta_w,
                      current_vocab=current_vocab, len_match=len(src_indices))

        # Grow the considered vocabulary when the weighted agreement dropped
        # NOTE(review): with the default --vocabulary_cutoff of 0 this min()
        # sets current_vocab to 0 - confirm that is intended.
        if it > 1 and stats.w_dot < pstats.w_dot:
            current_vocab = min(int(current_vocab * 1.1), args.vocabulary_cutoff)

        # T goes from 1 at the first iteration towards 1e-2 over maxiter iterations
        T = 1 * np.exp((it - 1) * np.log(1e-2) / (args.maxiter))
        score = collections.Counter()
        cum_weights = collections.Counter()
        matches, objective = find_matches(xw, zw, cum_weights, score,
                                          ul=current_vocab, T=T,
                                          kbest=args.corekbest,
                                          csls=args.csls_neighborhood,
                                          decay=args.decayrate)

        # Running average of the per-pair scores across iterations
        for m in decided:
            decided[m] = decided[m] * (1 - 1 / it)
        # NOTE(review): the original tested `if m in score` inside this loop
        # over `score`, so its `else: eta = max(0.5, 1 / it)` branch could
        # never run; eta was always 1 / it. The dead branch is removed without
        # changing behaviour. Possibly `m in decided` was intended - confirm.
        eta = 1 / it
        for m in score:
            decided[m] = decided[m] * (1 - eta) + score[m] * eta

        # Accuracy and similarity evaluation in validation
        if args.validation is not None:
            src = list(validation.keys())
            xw[src].dot(zw.T, out=simval)
            nn = asnumpy(simval.argmax(axis=1))
            accuracy = np.mean([1 if nn[i] in validation[src[i]] else 0
                                for i in range(len(src))])
            similarity = np.mean([
                np.max([simval[i, j].tolist() for j in validation[src[i]]])
                for i in range(len(src))
            ])

        # Dump the current dictionary scores. Words were decoded with
        # surrogateescape, so the same encoding/error handler is used here.
        with open(f'{OUTPUTDIR}/{args.dictname}.{it}', mode='w',
                  encoding=args.encoding, errors='surrogateescape') as f:
            for p in decided.most_common():
                si, ti = p[0]
                print(f'{src_words[si]}\t{trg_words[ti]}\t{p[1]:.3e}', file=f)

        # Logging
        duration = time.time() - t
        if args.verbose:
            print(file=sys.stderr)
            print('ITERATION {0} ({1:.2f}s)'.format(it, duration), file=sys.stderr)
            print('\t- Objective: {0:9.4f}%'.format(100 * objective), file=sys.stderr)
            print(f'\t- #match/#decided: {len(src_indices)}/{len(decided)}', file=sys.stderr)
            print(stats, file=sys.stderr)
            if args.validation is not None:
                print('\t- Val. similarity: {0:9.4f}%'.format(100 * similarity), file=sys.stderr)
                print('\t- Val. accuracy: {0:9.4f}%'.format(100 * accuracy), file=sys.stderr)
                print('\t- Val. coverage: {0:9.4f}%'.format(100 * validation_coverage),
                      file=sys.stderr)
            sys.stderr.flush()
        if log is not None:
            val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format(
                100 * similarity, 100 * accuracy,
                100 * validation_coverage) if args.validation is not None else ''
            print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(it, 100 * objective, val, duration),
                  file=log)
            log.flush()

        if it >= args.maxiter:
            break
        t = time.time()
        wprev = w
        pstats = stats
        it += 1

    if log is not None:
        log.close()

    # write mapped embeddings
    print('**** reading and writing final embeddings ****', file=sys.stderr)
    with open(args.src_input, encoding=args.encoding, errors='surrogateescape') as srcfile, \
            open(args.trg_input, encoding=args.encoding, errors='surrogateescape') as trgfile:
        src_words, x = embeddings.read(srcfile, dtype=dtype, threshold=100000)
        trg_words, z = embeddings.read(trgfile, dtype=dtype, threshold=100000)
    embeddings.normalize(x, args.normalize)
    embeddings.normalize(z, args.normalize)

    # The freshly read `x` lives on the host, while `w` may be a CuPy array
    # when --cuda is used; bring it to the host before multiplying.
    w_host = asnumpy(w)
    with open(args.src_output, mode='w', encoding=args.encoding, errors='surrogateescape') as srcfile, \
            open(args.trg_output, mode='w', encoding=args.encoding, errors='surrogateescape') as trgfile:
        embeddings.write(src_words, x.dot(w_host), srcfile)
        embeddings.write(trg_words, z, trgfile)
def add_oov_embeddings(train_dict_fname, test_dict_fname, src_emb_fname,
                       tgt_emb_fname, out_src_emb_fname, out_tgt_emb_fname,
                       src_model_path, tgt_model_path, fast_text_binary_path,
                       max_voc=200000, emb_format='txt'):
    """
    Extend two embedding files with vectors for dictionary words they lack.

    Words that occur in the training or test dictionary but not among the
    words read from the embedding file are treated as OOV. Their vectors are
    obtained through ``compute_fasttext_embeddings``, so this applies to
    FastText embeddings only. Each output file holds the OOV words first,
    followed by the words read from the original file (i.e. only the OOV
    words plus the first ``max_voc`` original entries).

    train_dict_fname: training bilingual dictionary file
    test_dict_fname: test bilingual dictionary file
    src_emb_fname: embedding file for source language
    tgt_emb_fname: embedding file for target language
    out_src_emb_fname: output embedding file for source language
    out_tgt_emb_fname: output embedding file for target language
    src_model_path: fasttext model for source language
    tgt_model_path: fasttext model for target language
    fast_text_binary_path: path to fasttext binary
    max_voc: number of vocab items to process from the embedding file
    emb_format: format of embedding files. Currently supported:
        'txt' - standard fast text format
        (this argument is not consulted by the body)
    """
    # Dictionaries
    train_pairs = read_bilingual_dict(train_dict_fname)
    test_pairs = read_bilingual_dict(test_dict_fname)

    # Original embeddings
    with open(src_emb_fname, 'r', encoding='utf-8') as src_in, \
            open(tgt_emb_fname, 'r', encoding='utf-8') as tgt_in:
        src_vcb_words, src_emb = embeddings.read(src_in, max_voc)
        tgt_vcb_words, tgt_emb = embeddings.read(tgt_in, max_voc)

    # Source-side OOVs: dictionary keys missing from the source vocabulary
    src_oov_words = set(train_pairs.keys())
    src_oov_words |= set(test_pairs.keys())
    src_oov_words -= set(src_vcb_words)
    print('Number of src OOV words: {}'.format(len(src_oov_words)))

    # Target-side OOVs: dictionary values missing from the target vocabulary
    # NOTE(review): it.chain over a single iterable just iterates it, so the
    # dictionary values themselves must be hashable words; if
    # read_bilingual_dict returns collections as values, chain.from_iterable
    # was probably intended - confirm.
    tgt_oov_words = set(it.chain(train_pairs.values()))
    tgt_oov_words |= set(it.chain(test_pairs.values()))
    tgt_oov_words -= set(tgt_vcb_words)
    print('Number of tgt OOV words: {}'.format(len(tgt_oov_words)))

    # Vectors for the OOV words
    # (equivalent to: cat queries.txt | ./fasttext print-word-vectors model.bin)
    src_oov_final_words, src_oov_emb = compute_fasttext_embeddings(
        src_oov_words, src_model_path, fast_text_binary_path)
    tgt_oov_final_words, tgt_oov_emb = compute_fasttext_embeddings(
        tgt_oov_words, tgt_model_path, fast_text_binary_path)

    n_src_wanted, n_src_got = len(src_oov_words), len(src_oov_final_words)
    if n_src_wanted != n_src_got:
        print(
            'WARNING: Embeddings not computed for {} words out of {} OOV source words'
            .format(n_src_wanted - n_src_got, n_src_wanted))

    n_tgt_wanted, n_tgt_got = len(tgt_oov_words), len(tgt_oov_final_words)
    if n_tgt_wanted != n_tgt_got:
        print(
            'WARNING: Embeddings not computed for {} words out of {} OOV target words'
            .format(n_tgt_wanted - n_tgt_got, n_tgt_wanted))

    # New embedding files: OOV entries first, then the original entries
    with open(out_src_emb_fname, 'w', encoding='utf-8') as src_out, \
            open(out_tgt_emb_fname, 'w', encoding='utf-8') as tgt_out:
        merged_src_words = src_oov_final_words + src_vcb_words
        merged_src_emb = np.concatenate([src_oov_emb, src_emb])
        embeddings.write(merged_src_words, merged_src_emb, src_out)

        merged_tgt_words = tgt_oov_final_words + tgt_vcb_words
        merged_tgt_emb = np.concatenate([tgt_oov_emb, tgt_emb])
        embeddings.write(merged_tgt_words, merged_tgt_emb, tgt_out)
def main():
    """Learn a mapping between two embedding spaces and pickle the source mapping matrix.

    Command-line entry point. It reads the source/target embeddings (text
    format via ``embeddings.read`` or a pickled ``(words, matrix)`` pair),
    builds a seed dictionary (unsupervised, numerals, identical words or a
    dictionary file), optionally runs the self-learning loop, and finally
    pickles a dictionary holding the mapping matrix ``w`` to ``src_output``.

    Exits with status -1 on inconsistent arguments (de-whitening without
    whitening, CUDA requested without CuPy, or a configuration in which no
    mapping matrix ``w`` would ever be computed).
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Map word embeddings in two languages into a shared space')
    parser.add_argument('src_input', help='the input source embeddings')
    parser.add_argument('trg_input', help='the input target embeddings')
    parser.add_argument('src_output', help='the output source embeddings')
    parser.add_argument('-e', '--epochs', type=int, default=500, help='number of iterations')
    parser.add_argument('--pickle', action='store_true', help='load embedding from pickled object')
    parser.add_argument('--trg_output', help='the output target embeddings')
    parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32', help='the floating-point precision (defaults to fp32)')
    parser.add_argument('--cuda', action='store_true', help='use cuda (requires cupy)')
    parser.add_argument('--batch_size', default=10000, type=int, help='batch size (defaults to 10000); does not affect results, larger is usually faster but uses more memory')
    parser.add_argument('--seed', type=int, default=0, help='the random seed (defaults to 0)')

    recommended_group = parser.add_argument_group('recommended settings', 'Recommended settings for different scenarios')
    recommended_type = recommended_group.add_mutually_exclusive_group()
    recommended_type.add_argument('--supervised', metavar='DICTIONARY', help='recommended if you have a large training dictionary')
    recommended_type.add_argument('--semi_supervised', metavar='DICTIONARY', help='recommended if you have a small seed dictionary')
    recommended_type.add_argument('--identical', action='store_true', help='recommended if you have no seed dictionary but can rely on identical words')
    recommended_type.add_argument('--unsupervised', action='store_true', help='recommended if you have no seed dictionary and do not want to rely on identical words')
    recommended_type.add_argument('--acl2018', action='store_true', help='reproduce our ACL 2018 system')
    recommended_type.add_argument('--aaai2018', metavar='DICTIONARY', help='reproduce our AAAI 2018 system')
    recommended_type.add_argument('--acl2017', action='store_true', help='reproduce our ACL 2017 system with numeral initialization')
    recommended_type.add_argument('--acl2017_seed', metavar='DICTIONARY', help='reproduce our ACL 2017 system with a seed dictionary')
    recommended_type.add_argument('--emnlp2016', metavar='DICTIONARY', help='reproduce our EMNLP 2016 system')

    init_group = parser.add_argument_group('advanced initialization arguments', 'Advanced initialization arguments')
    init_type = init_group.add_mutually_exclusive_group()
    init_type.add_argument('-d', '--init_dictionary', default=sys.stdin.fileno(), metavar='DICTIONARY', help='the training dictionary file (defaults to stdin)')
    init_type.add_argument('--init_identical', action='store_true', help='use identical words as the seed dictionary')
    init_type.add_argument('--init_numerals', action='store_true', help='use latin numerals (i.e. words matching [0-9]+) as the seed dictionary')
    init_type.add_argument('--init_unsupervised', action='store_true', help='use unsupervised initialization')
    init_group.add_argument('--unsupervised_vocab', type=int, default=0, help='restrict the vocabulary to the top k entries for unsupervised initialization')

    mapping_group = parser.add_argument_group('advanced mapping arguments', 'Advanced embedding mapping arguments')
    mapping_group.add_argument('--normalize', choices=['unit', 'center', 'unitdim', 'centeremb', 'none'], nargs='*', default=[], help='the normalization actions to perform in order')
    mapping_group.add_argument('--whiten', action='store_true', help='whiten the embeddings')
    mapping_group.add_argument('--src_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the source language embeddings')
    mapping_group.add_argument('--trg_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the target language embeddings')
    mapping_group.add_argument('--src_dewhiten', choices=['src', 'trg'], help='de-whiten the source language embeddings')
    mapping_group.add_argument('--trg_dewhiten', choices=['src', 'trg'], help='de-whiten the target language embeddings')
    mapping_group.add_argument('--dim_reduction', type=int, default=0, help='apply dimensionality reduction')
    mapping_type = mapping_group.add_mutually_exclusive_group()
    mapping_type.add_argument('-c', '--orthogonal', action='store_true', help='use orthogonal constrained mapping')
    mapping_type.add_argument('-u', '--unconstrained', action='store_true', help='use unconstrained mapping')

    self_learning_group = parser.add_argument_group('advanced self-learning arguments', 'Advanced arguments for self-learning')
    self_learning_group.add_argument('--self_learning', action='store_true', help='enable self-learning')
    self_learning_group.add_argument('--vocabulary_cutoff', type=int, default=0, help='restrict the vocabulary to the top k entries')
    self_learning_group.add_argument('--direction', choices=['forward', 'backward', 'union'], default='union', help='the direction for dictionary induction (defaults to union)')
    self_learning_group.add_argument('--csls', type=int, nargs='?', default=0, const=10, metavar='NEIGHBORHOOD_SIZE', dest='csls_neighborhood', help='use CSLS for dictionary induction')
    self_learning_group.add_argument('--threshold', default=0.000001, type=float, help='the convergence threshold (defaults to 0.000001)')
    self_learning_group.add_argument('--validation', default=None, metavar='DICTIONARY', help='a dictionary file for validation at each iteration')
    self_learning_group.add_argument('--stochastic_initial', default=0.1, type=float, help='initial keep probability stochastic dictionary induction (defaults to 0.1)')
    self_learning_group.add_argument('--stochastic_multiplier', default=2.0, type=float, help='stochastic dictionary induction multiplier (defaults to 2.0)')
    self_learning_group.add_argument('--stochastic_interval', default=50, type=int, help='stochastic dictionary induction interval (defaults to 50)')
    self_learning_group.add_argument('--log', help='write to a log file in tsv format at each iteration')
    self_learning_group.add_argument('-v', '--verbose', action='store_true', help='write log information to stderr at each iteration')
    args = parser.parse_args()

    # The "recommended" presets are implemented by rewriting the parser
    # defaults and parsing a second time, so explicitly given options still
    # override the preset values.
    if args.supervised is not None:
        parser.set_defaults(init_dictionary=args.supervised, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', batch_size=1000)
    if args.semi_supervised is not None:
        parser.set_defaults(init_dictionary=args.semi_supervised, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10)
    if args.identical:
        parser.set_defaults(init_identical=True, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10)
    if args.unsupervised or args.acl2018:
        parser.set_defaults(init_unsupervised=True, unsupervised_vocab=4000, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10)
    if args.aaai2018:
        parser.set_defaults(init_dictionary=args.aaai2018, normalize=['unit', 'center'], whiten=True, trg_reweight=1, src_dewhiten='src', trg_dewhiten='trg', batch_size=1000)
    if args.acl2017:
        parser.set_defaults(init_numerals=True, orthogonal=True, normalize=['unit', 'center'], self_learning=True, direction='forward', stochastic_initial=1.0, stochastic_interval=1, batch_size=1000)
    if args.acl2017_seed:
        parser.set_defaults(init_dictionary=args.acl2017_seed, orthogonal=True, normalize=['unit', 'center'], self_learning=True, direction='forward', stochastic_initial=1.0, stochastic_interval=1, batch_size=1000)
    if args.emnlp2016:
        parser.set_defaults(init_dictionary=args.emnlp2016, orthogonal=True, normalize=['unit', 'center'], batch_size=1000)
    args = parser.parse_args()

    # Check command line arguments
    if (args.src_dewhiten is not None or args.trg_dewhiten is not None) and not args.whiten:
        print('ERROR: De-whitening requires whitening first', file=sys.stderr)
        sys.exit(-1)
    # The output of this script is the matrix `w`, which is only assigned by
    # the orthogonal/unconstrained branches of the loop below.  Without
    # self-learning the loop runs exactly once, and if neither of those
    # mappings is selected `w` would never exist; fail now instead of raising
    # a NameError after all the computation has been done.
    if not (args.self_learning or args.orthogonal or args.unconstrained):
        print('ERROR: No mapping matrix would be computed; use --orthogonal, --unconstrained or --self_learning', file=sys.stderr)
        sys.exit(-1)

    # Choose the right dtype for the desired precision
    dtype = {'fp16': 'float16', 'fp32': 'float32', 'fp64': 'float64'}[args.precision]

    # Read input embeddings
    if args.pickle:
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only use --pickle with trusted inputs.
        with open(args.src_input, 'rb') as fin:
            src_words, x = pickle.load(fin)
        with open(args.trg_input, 'rb') as fin:
            trg_words, z = pickle.load(fin)
    else:
        with open(args.src_input, encoding=args.encoding, errors='surrogateescape') as srcfile, \
                open(args.trg_input, encoding=args.encoding, errors='surrogateescape') as trgfile:
            src_words, x = embeddings.read(srcfile, dtype=dtype)
            trg_words, z = embeddings.read(trgfile, dtype=dtype)

    # NumPy/CuPy management
    if args.cuda:
        if not supports_cupy():
            print('ERROR: Install CuPy for CUDA support', file=sys.stderr)
            sys.exit(-1)
        xp = get_cupy()
        x = xp.asarray(x)
        z = xp.asarray(z)
    else:
        xp = np
    xp.random.seed(args.seed)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # STEP 0: Normalization
    embeddings.normalize(x, args.normalize)
    embeddings.normalize(z, args.normalize)

    # Build the seed dictionary
    src_indices = []
    trg_indices = []
    if args.init_unsupervised:
        sim_size = min(x.shape[0], z.shape[0]) if args.unsupervised_vocab <= 0 else min(x.shape[0], z.shape[0], args.unsupervised_vocab)
        u, s, vt = xp.linalg.svd(x[:sim_size], full_matrices=False)
        xsim = (u*s).dot(u.T)
        u, s, vt = xp.linalg.svd(z[:sim_size], full_matrices=False)
        zsim = (u*s).dot(u.T)
        del u, s, vt
        xsim.sort(axis=1)
        zsim.sort(axis=1)
        embeddings.normalize(xsim, args.normalize)
        embeddings.normalize(zsim, args.normalize)
        sim = xsim.dot(zsim.T)
        if args.csls_neighborhood > 0:
            knn_sim_fwd = topk_mean(sim, k=args.csls_neighborhood)
            knn_sim_bwd = topk_mean(sim.T, k=args.csls_neighborhood)
            sim -= knn_sim_fwd[:, xp.newaxis]/2 + knn_sim_bwd/2
        if args.direction == 'forward':
            src_indices = xp.arange(sim_size)
            trg_indices = sim.argmax(axis=1)
        elif args.direction == 'backward':
            src_indices = sim.argmax(axis=0)
            trg_indices = xp.arange(sim_size)
        elif args.direction == 'union':
            src_indices = xp.concatenate((xp.arange(sim_size), sim.argmax(axis=0)))
            trg_indices = xp.concatenate((sim.argmax(axis=1), xp.arange(sim_size)))
        del xsim, zsim, sim
    elif args.init_numerals:
        numeral_regex = re.compile('^[0-9]+$')
        src_numerals = {word for word in src_words if numeral_regex.match(word) is not None}
        trg_numerals = {word for word in trg_words if numeral_regex.match(word) is not None}
        numerals = src_numerals.intersection(trg_numerals)
        for word in numerals:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
    elif args.init_identical:
        identical = set(src_words).intersection(set(trg_words))
        for word in identical:
            src_indices.append(src_word2ind[word])
            trg_indices.append(trg_word2ind[word])
    else:
        # args.init_dictionary is either a path or the stdin file descriptor.
        with open(args.init_dictionary, encoding=args.encoding, errors='surrogateescape') as f:
            for line in f:
                src, trg = line.split()
                try:
                    src_ind = src_word2ind[src]
                    trg_ind = trg_word2ind[trg]
                    src_indices.append(src_ind)
                    trg_indices.append(trg_ind)
                except KeyError:
                    print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg), file=sys.stderr)

    # Read validation dictionary
    if args.validation is not None:
        validation = collections.defaultdict(set)
        oov = set()
        vocab = set()
        with open(args.validation, encoding=args.encoding, errors='surrogateescape') as f:
            for line in f:
                src, trg = line.split()
                try:
                    src_ind = src_word2ind[src]
                    trg_ind = trg_word2ind[trg]
                    validation[src_ind].add(trg_ind)
                    vocab.add(src)
                except KeyError:
                    oov.add(src)
        oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
        validation_coverage = len(validation) / (len(validation) + len(oov))

    # Create log file (stays None when no log was requested, so the logging
    # step below can test the handle itself)
    log = None
    if args.log:
        log = open(args.log, mode='w', encoding=args.encoding, errors='surrogateescape')

    # Allocate memory
    xw = xp.empty_like(x)
    zw = xp.empty_like(z)
    src_size = x.shape[0] if args.vocabulary_cutoff <= 0 else min(x.shape[0], args.vocabulary_cutoff)
    trg_size = z.shape[0] if args.vocabulary_cutoff <= 0 else min(z.shape[0], args.vocabulary_cutoff)
    simfwd = xp.empty((args.batch_size, trg_size), dtype=dtype)
    simbwd = xp.empty((args.batch_size, src_size), dtype=dtype)
    if args.validation is not None:
        simval = xp.empty((len(validation.keys()), z.shape[0]), dtype=dtype)

    best_sim_forward = xp.full(src_size, -100, dtype=dtype)
    src_indices_forward = xp.arange(src_size)
    trg_indices_forward = xp.zeros(src_size, dtype=int)
    best_sim_backward = xp.full(trg_size, -100, dtype=dtype)
    src_indices_backward = xp.zeros(trg_size, dtype=int)
    trg_indices_backward = xp.arange(trg_size)
    knn_sim_fwd = xp.zeros(src_size, dtype=dtype)
    knn_sim_bwd = xp.zeros(trg_size, dtype=dtype)

    # Training loop
    best_objective = objective = -100.
    it = 1
    last_improvement = 0
    keep_prob = args.stochastic_initial
    t = time.time()
    end = not args.self_learning
    epoch = 0
    while True:
        # Hard iteration budget: after args.epochs iterations dropout is
        # disabled, and 50 iterations later the loop is forced to finish.
        epoch += 1
        if epoch == args.epochs:
            keep_prob = 1.0
        if epoch == args.epochs + 50:
            end = True

        # Increase the keep probability if we have not improve in args.stochastic_interval iterations
        if it - last_improvement > args.stochastic_interval:
            if keep_prob >= 1.0:
                end = True
            keep_prob = min(1.0, args.stochastic_multiplier*keep_prob)
            last_improvement = it

        # Update the embedding mapping
        if args.orthogonal or not end:  # orthogonal mapping
            u, s, vt = xp.linalg.svd(z[trg_indices].T.dot(x[src_indices]))
            w = vt.T.dot(u.T)
            x.dot(w, out=xw)
            zw[:] = z
        elif args.unconstrained:  # unconstrained mapping
            x_pseudoinv = xp.linalg.inv(x[src_indices].T.dot(x[src_indices])).dot(x[src_indices].T)
            w = x_pseudoinv.dot(z[trg_indices])
            x.dot(w, out=xw)
            zw[:] = z
        else:  # advanced mapping
            # NOTE(review): this branch does not update `w`, so the matrix
            # pickled at the end is the orthogonal map from the previous
            # iteration, not the transformation applied here — confirm that
            # this is intended.

            # TODO xw.dot(wx2, out=xw) and alike not working
            xw[:] = x
            zw[:] = z

            # STEP 1: Whitening
            def whitening_transformation(m):
                u, s, vt = xp.linalg.svd(m, full_matrices=False)
                return vt.T.dot(xp.diag(1/s)).dot(vt)
            if args.whiten:
                wx1 = whitening_transformation(xw[src_indices])
                wz1 = whitening_transformation(zw[trg_indices])
                xw = xw.dot(wx1)
                zw = zw.dot(wz1)

            # STEP 2: Orthogonal mapping
            wx2, s, wz2_t = xp.linalg.svd(xw[src_indices].T.dot(zw[trg_indices]))
            wz2 = wz2_t.T
            xw = xw.dot(wx2)
            zw = zw.dot(wz2)

            # STEP 3: Re-weighting
            xw *= s**args.src_reweight
            zw *= s**args.trg_reweight

            # STEP 4: De-whitening
            if args.src_dewhiten == 'src':
                xw = xw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2))
            elif args.src_dewhiten == 'trg':
                xw = xw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2))
            if args.trg_dewhiten == 'src':
                zw = zw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2))
            elif args.trg_dewhiten == 'trg':
                zw = zw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2))

            # STEP 5: Dimensionality reduction
            if args.dim_reduction > 0:
                xw = xw[:, :args.dim_reduction]
                zw = zw[:, :args.dim_reduction]

        # Self-learning
        if end:
            break
        else:
            # Update the training dictionary
            if args.direction in ('forward', 'union'):
                if args.csls_neighborhood > 0:
                    for i in range(0, trg_size, simbwd.shape[0]):
                        j = min(i + simbwd.shape[0], trg_size)
                        zw[i:j].dot(xw[:src_size].T, out=simbwd[:j-i])
                        knn_sim_bwd[i:j] = topk_mean(simbwd[:j-i], k=args.csls_neighborhood, inplace=True)
                for i in range(0, src_size, simfwd.shape[0]):
                    j = min(i + simfwd.shape[0], src_size)
                    xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j-i])
                    simfwd[:j-i].max(axis=1, out=best_sim_forward[i:j])
                    simfwd[:j-i] -= knn_sim_bwd/2  # Equivalent to the real CSLS scores for NN
                    dropout(simfwd[:j-i], 1 - keep_prob).argmax(axis=1, out=trg_indices_forward[i:j])
            if args.direction in ('backward', 'union'):
                if args.csls_neighborhood > 0:
                    for i in range(0, src_size, simfwd.shape[0]):
                        j = min(i + simfwd.shape[0], src_size)
                        xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j-i])
                        knn_sim_fwd[i:j] = topk_mean(simfwd[:j-i], k=args.csls_neighborhood, inplace=True)
                for i in range(0, trg_size, simbwd.shape[0]):
                    j = min(i + simbwd.shape[0], trg_size)
                    zw[i:j].dot(xw[:src_size].T, out=simbwd[:j-i])
                    simbwd[:j-i].max(axis=1, out=best_sim_backward[i:j])
                    simbwd[:j-i] -= knn_sim_fwd/2  # Equivalent to the real CSLS scores for NN
                    dropout(simbwd[:j-i], 1 - keep_prob).argmax(axis=1, out=src_indices_backward[i:j])
            if args.direction == 'forward':
                src_indices = src_indices_forward
                trg_indices = trg_indices_forward
            elif args.direction == 'backward':
                src_indices = src_indices_backward
                trg_indices = trg_indices_backward
            elif args.direction == 'union':
                src_indices = xp.concatenate((src_indices_forward, src_indices_backward))
                trg_indices = xp.concatenate((trg_indices_forward, trg_indices_backward))

            # Objective function evaluation
            if args.direction == 'forward':
                objective = xp.mean(best_sim_forward).tolist()
            elif args.direction == 'backward':
                objective = xp.mean(best_sim_backward).tolist()
            elif args.direction == 'union':
                objective = (xp.mean(best_sim_forward) + xp.mean(best_sim_backward)).tolist() / 2
            if objective - best_objective >= args.threshold:
                last_improvement = it
                best_objective = objective

            # Accuracy and similarity evaluation in validation
            if args.validation is not None:
                src = list(validation.keys())
                xw[src].dot(zw.T, out=simval)
                nn = asnumpy(simval.argmax(axis=1))
                accuracy = np.mean([1 if nn[i] in validation[src[i]] else 0 for i in range(len(src))])
                similarity = np.mean([max([simval[i, j].tolist() for j in validation[src[i]]]) for i in range(len(src))])

            # Logging
            duration = time.time() - t
            if args.verbose:
                print(file=sys.stderr)
                print('ITERATION {0} ({1:.2f}s)'.format(it, duration), file=sys.stderr)
                print('\t- Objective: {0:9.4f}%'.format(100 * objective), file=sys.stderr)
                print('\t- Drop probability: {0:9.4f}%'.format(100 - 100*keep_prob), file=sys.stderr)
                if args.validation is not None:
                    print('\t- Val. similarity: {0:9.4f}%'.format(100 * similarity), file=sys.stderr)
                    print('\t- Val. accuracy: {0:9.4f}%'.format(100 * accuracy), file=sys.stderr)
                    print('\t- Val. coverage: {0:9.4f}%'.format(100 * validation_coverage), file=sys.stderr)
                sys.stderr.flush()
            if log is not None:
                val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format(
                    100 * similarity, 100 * accuracy, 100 * validation_coverage) if args.validation is not None else ''
                print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(it, 100 * objective, val, duration), file=log)
                log.flush()

        t = time.time()
        it += 1

    if log is not None:
        log.close()

    # Write the learned mapping.  Only the source side is mapped by `w`; the
    # target side is stored as an identity of the embedding dimensionality.
    dic = {
        'W_source': asnumpy(w),
        'W_target': np.identity(z.shape[1], dtype=np.float32),
        'source_lang': 'en',
        # NOTE(review): the language tag is derived from fixed positions in
        # the target path and yields a list in the non-'.bin' case — confirm
        # against the directory layout the consumers expect.
        'target_lang': args.trg_input.split('/')[1][:2] if args.trg_input.endswith('.bin') else args.trg_input.split('/')[5:7],
        'model': 'ubi',
        'note': 'vecmap',
    }
    with open(args.src_output, 'wb') as fout:
        pickle.dump(dic, fout)
def translate(src_emb_fname, tgt_emb_fname, trans_tgt_fname=None, trans_src_fname=None, retrieval_method="csls", csls_k=10, batch_size=2500):
    """Translate source words by nearest-neighbour retrieval in a shared embedding space.

    Parameters:
        src_emb_fname: path of the source embeddings file (read with ``embeddings.read``).
        tgt_emb_fname: path of the target embeddings file.
        trans_tgt_fname: if given, the translations are written to this file
            as ``word<TAB>translation`` lines and nothing is returned.
        trans_src_fname: the words to translate; either a path to a file with
            one word per line, a list of words, or anything else (e.g. None)
            to translate the whole source vocabulary.
        retrieval_method: ``'nn'`` (dot product of length-normalized vectors)
            or ``'csls'`` (the target-side neighbourhood is computed with CuPy).
        csls_k: neighbourhood size for CSLS.
        batch_size: number of rows processed per similarity computation.

    Returns:
        A dict mapping each translated word to its translation when
        ``trans_tgt_fname`` is None, otherwise None.

    Raises:
        ValueError: if ``retrieval_method`` is not ``'nn'`` or ``'csls'``.
    """
    # An unknown method used to fall through both branches and silently map
    # every word to the first target word.
    if retrieval_method not in ('nn', 'csls'):
        raise ValueError("retrieval_method must be 'nn' or 'csls', got {!r}".format(retrieval_method))

    print('Loading train data...')
    with open(src_emb_fname, 'r', encoding='utf-8', errors='surrogateescape') as srcfile, \
            open(tgt_emb_fname, 'r', encoding='utf-8', errors='surrogateescape') as tgtfile:
        # Read source embeddings
        src_words, x = embeddings.read(srcfile, max_voc=0, dtype='float32')
        # Read target embeddings
        tgt_words, z = embeddings.read(tgtfile, max_voc=0, dtype='float32')
    src_word2ind = {word: i for i, word in enumerate(src_words)}

    xw = embeddings.length_normalize(x)
    zw = embeddings.length_normalize(z)

    # Collect the words to translate
    if isinstance(trans_src_fname, str):
        with open(trans_src_fname, 'r', encoding='utf-8', errors='surrogateescape') as trans_src_file:
            all_words = [line.strip() for line in trans_src_file]
    elif isinstance(trans_src_fname, list):
        all_words = list(trans_src_fname)
    else:
        all_words = None

    if all_words is None:
        # No word list given: translate the complete source vocabulary
        all_words = src_words
        trans_words = src_words
        trans_idx = list(range(len(src_words)))
        oov = set()
    else:
        trans_words = []
        trans_idx = []
        oov = set()
        for w in all_words:
            try:
                w_ind = src_word2ind[w]
            except KeyError:
                oov.add(w)
                continue
            trans_words.append(w)
            trans_idx.append(w_ind)

    print(len(all_words))
    print(len(trans_words))
    print(len(trans_idx))
    print(len(oov))

    src = trans_idx
    translation = {}  # source index -> index of the best target word

    if retrieval_method == 'nn':  # Standard nearest neighbor
        for i in range(0, len(src), batch_size):
            j = min(i + batch_size, len(src))
            similarities = xw[src[i:j]].dot(zw.T)
            nn = similarities.argmax(axis=1).tolist()
            translation.update(zip(src[i:j], nn))

    else:  # CSLS
        t = time.time()
        nbrhood_x = np.zeros(xw.shape[0])
        nbrhood_z2 = cp.zeros(zw.shape[0])

        print('Computing X Neighbourhood')
        for i in range(0, len(src), batch_size):
            j = min(i + batch_size, len(src))
            similarities = xw[src[i:j]].dot(zw.T)
            # Partitioning the negated scores puts the csls_k largest
            # similarities in the first csls_k columns without a full sort.
            similarities_x = -1 * np.partition(-1 * similarities, csls_k - 1, axis=1)
            nbrhood_x[src[i:j]] = np.mean(similarities_x[:, :csls_k], axis=1)
        print('Completed in {0} seconds'.format(time.time() - t))

        print('Computing Z Neighbourhood')
        batch_num = 1
        for i in range(0, zw.shape[0], batch_size):
            j = min(i + batch_size, zw.shape[0])
            similarities = -1 * cp.partition(
                -1 * cp.dot(cp.asarray(zw[i:j]), cp.transpose(cp.asarray(xw))),
                csls_k - 1, axis=1)[:, :csls_k]
            nbrhood_z2[i:j] = (cp.mean(similarities[:, :csls_k], axis=1))
            print('Completed batch {0} in {1}'.format(batch_num, time.time() - t))
            batch_num += 1
        nbrhood_z = cp.asnumpy(nbrhood_z2)
        print(time.time() - t)

        csls_alpha = 1
        for i in range(0, len(src), batch_size):
            j = min(i + batch_size, len(src))
            similarities = xw[src[i:j]].dot(zw.T)
            # 2*cos(x, z) - alpha*r(x) - alpha*r(z); the double transpose
            # broadcasts the per-row source neighbourhood term.
            similarities = np.transpose(
                np.transpose(2 * similarities) - csls_alpha * nbrhood_x[src[i:j]]) - csls_alpha * nbrhood_z
            nn = similarities.argmax(axis=1).tolist()
            print(time.time() - t)
            translation.update(zip(src[i:j], nn))
        print('Completed in {0} seconds'.format(time.time() - t))

    # get translations
    trans_pairs = []
    for w in trans_words:
        # Every word in trans_words is in the vocabulary and its index was
        # translated above.
        trans = tgt_words[translation[src_word2ind[w]]]
        if len(trans) > 0 or trans_tgt_fname is not None:
            ### include blank lines only in the case of writing output to file
            trans_pairs.append((w, trans))

    ### write the translations (1 pair per line format)
    if trans_tgt_fname is not None:
        with open(trans_tgt_fname, 'w', encoding='utf-8', errors='surrogateescape') as trans_tgt_file:
            for w, trans in trans_pairs:
                trans_tgt_file.write('{}\t{}\n'.format(w, trans))
    else:
        return dict(trans_pairs)
def main():
    """Evaluate cross-lingual word analogies (``c - a + b ~ d``).

    Command-line entry point. Each question line of the test file holds four
    words: ``a`` and ``b`` are looked up in the source embeddings, ``c`` and
    ``d`` in the target embeddings. Lines starting with ``': '`` open a new
    category; categories whose name starts with ``gram`` count as syntactic.
    Questions with an out-of-vocabulary word are counted as OOV and skipped.
    The predictions are written to ``crosslingual_predict.txt`` and
    coverage/accuracy figures are printed to stdout.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Evaluate embeddings in word analogy')
    parser.add_argument('--src_embeddings', help='the word embeddings for source (left side)')
    parser.add_argument('--trg_embeddings', help='the word embeddings for target (right side)')
    parser.add_argument('-t', '--threshold', type=int, default=0, help='reduce vocabulary of the model for fast approximate evaluation (0 = off, otherwise typical value is 30,000)')
    parser.add_argument('-i', '--input', default=sys.stdin.fileno(), help='the test file (defaults to stdin)')
    parser.add_argument('-v', '--verbose', action='store_true', help='verbose output (give category specific results)')
    parser.add_argument('-l1', '--src_lowercase', action='store_true', help='lowercase the words in the test file')
    parser.add_argument('-l2', '--trg_lowercase', action='store_true', help='lowercase the words in the test file')
    parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32', help='the floating-point precision (defaults to fp32)')
    args = parser.parse_args()

    # Choose the right dtype for the desired precision
    dtype = {'fp16': 'float16', 'fp32': 'float32', 'fp64': 'float64'}[args.precision]

    # Read input embeddings
    with open(args.src_embeddings, encoding=args.encoding, errors='surrogateescape') as f:
        src_words, src_matrix = embeddings.read(f, threshold=args.threshold, dtype=dtype)
    with open(args.trg_embeddings, encoding=args.encoding, errors='surrogateescape') as f:
        trg_words, trg_matrix = embeddings.read(f, threshold=args.threshold, dtype=dtype)

    # Build word to index map
    src_word2ind = {word: i for i, word in enumerate(src_words)}
    trg_word2ind = {word: i for i, word in enumerate(trg_words)}

    # Length normalize embeddings
    # NOTE(review): the return values are discarded, so this relies on
    # length_normalize working in place — confirm against the embeddings module.
    embeddings.length_normalize(src_matrix)
    embeddings.length_normalize(trg_matrix)

    def ratio(numerator, denominator):
        # Empty categories/groups have no defined coverage or accuracy;
        # report NaN instead of dividing by zero.
        return numerator / denominator if denominator else float('nan')

    # Parse test file
    # c-a+b ~ d
    categories = []
    a = []  # src lang
    b = []  # src lang
    c = []  # trg lang
    d = []  # trg lang
    with open(args.input, encoding=args.encoding, errors='surrogateescape') as f:
        for line in f:
            if line.startswith(': '):
                name = line[2:-1]
                is_syntactic = name.startswith('gram')
                categories.append({'name': name, 'is_syntactic': is_syntactic, 'total': 0, 'oov': 0})
                continue
            words = line.split()
            if not words:
                # Blank lines (e.g. at the end of the file) carry no question
                continue
            try:
                w0 = src_word2ind[words[0].lower() if args.src_lowercase else words[0]]
                w1 = src_word2ind[words[1].lower() if args.src_lowercase else words[1]]
                w2 = trg_word2ind[words[2].lower() if args.trg_lowercase else words[2]]
                w3 = trg_word2ind[words[3].lower() if args.trg_lowercase else words[3]]
            except KeyError:
                categories[-1]['oov'] += 1
            else:
                a.append(w0)
                b.append(w1)
                c.append(w2)
                d.append(w3)
                categories[-1]['total'] += 1
    total = len(a)

    # Compute nearest neighbors using efficient matrix multiplication
    nn = []
    for i in range(0, total, BATCH_SIZE):
        j = min(i + BATCH_SIZE, total)
        similarities = (trg_matrix[c[i:j]] - src_matrix[a[i:j]] + src_matrix[b[i:j]]).dot(trg_matrix.T)
        # The columns index the *target* vocabulary, so only the target-side
        # query word c can be excluded.  a and b are source-vocabulary
        # indices: masking them here would hide unrelated target words and
        # overflow when the source vocabulary is the larger one.
        similarities[range(j-i), c[i:j]] = -1
        nn += np.argmax(similarities, axis=1).tolist()
    nn = np.array(nn, dtype=int)

    # Write the predictions with the same encoding/error policy the words
    # were read with, so surrogate-escaped bytes round-trip.
    with open('crosslingual_predict.txt', 'w', encoding=args.encoding, errors='surrogateescape') as outfile:
        for ai, bi, ci, di, pred in zip(a, b, c, d, nn):
            outfile.write(src_words[ai]+' '+src_words[bi]+' '+trg_words[ci]+' '+trg_words[di]+' | '+trg_words[pred]+'\n')

    # Compute and print accuracies
    semantic = {'correct': 0, 'total': 0, 'oov': 0}
    syntactic = {'correct': 0, 'total': 0, 'oov': 0}
    gold = np.array(d, dtype=int)
    ind = 0
    for category in categories:
        current = syntactic if category['is_syntactic'] else semantic
        # Questions were appended in file order, so each category owns a
        # contiguous slice of the predictions.
        correct = np.sum(nn[ind:ind+category['total']] == gold[ind:ind+category['total']])
        current['correct'] += correct
        current['total'] += category['total']
        current['oov'] += category['oov']
        ind += category['total']
        if args.verbose:
            print('Coverage:{0:7.2%} Accuracy:{1:7.2%} | {2}'.format(
                ratio(category['total'], category['total'] + category['oov']),
                ratio(correct, category['total']),
                category['name']))
    if args.verbose:
        print('-'*80)
    print('Coverage:{0:7.2%} Accuracy:{1:7.2%} (sem:{2:7.2%}, syn:{3:7.2%})'.format(
        ratio(semantic['total'] + syntactic['total'], semantic['total'] + syntactic['total'] + semantic['oov'] + syntactic['oov']),
        ratio(semantic['correct'] + syntactic['correct'], semantic['total'] + syntactic['total']),
        ratio(semantic['correct'], semantic['total']),
        ratio(syntactic['correct'], syntactic['total'])))