def toString(wordsX, wordsY, sorted_edge_cost, lex=None):
    # Formats the current matching as a human-readable string, one pair per line,
    # marking each pair as correct/wrong against the lexicon (if one is given).
    N = len(sorted_edge_cost)
    s = ''
    for n in xrange(N):
        weight = sorted_edge_cost[n]
        source_word = wordsX[n]
        target_word = wordsY[n]
        if lex is not None:
            matched = BU.is_valid_match(lex, source_word, target_word)
            matched = "correct" if matched else " wrong "
        else:
            matched = source_word == target_word
        # common.log(200, '{},{},{},{:>6},{:>12}'.format(source_word, target_word, matched, weight, n))
        # common.log(200, '{} - {:>12}) {:>12} {:>12} {:>6}'.format(matched, n, source_word, target_word, weight))
        # s += '{} - {:>4}),{:>10},{:>10},{:>4}'.format(matched, n, source_word, target_word, weight)
        s += '{} - {:>10},{:>10}'.format(matched, source_word, target_word)
        s += '\n'
    return s
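
# A minimal usage sketch for toString (hypothetical words and costs; the demo
# function name is illustrative, not part of the original module). With no
# lexicon, each pair is marked True/False by exact string equality:
def _toString_demo():
    costs = [0.9, 0.4]
    print toString(['cat', 'dog'], ['cat', 'hund'], costs)
    # prints lines like:
    #   True  -        cat,       cat
    #   False -        dog,      hund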
def find_matching(options, wordsX, wordsY):
    # Finds a permutation pi that best matches Y to X.
    # The optimization procedure works as follows: suppose there are 2000 words
    # to be matched, 100 seed words, and the step size is 100.
    # The seed is stored at the end (so X[i, :] matches Y[i, :] for i >= 2000) in all iterations.
    # At each iteration t (starting at t=0):
    #   1. compute the CCA on the last 100 + 100*t entries
    #   2. compute the CCA representation of all words
    #   3. perform a matching on the first N=2000 words to get pi_t
    #   4. sort the first 2000 matches in descending order of edge cost
    # Initially, pi is assumed to be the identity.
    N = len(wordsX.words)
    M = N - options.seed_length  # the first M entries can be permuted; the rest are fixed
    GX = None
    GY = None
    options.cca_weights = None
    sorted_edge_cost = None
    fixed_point = False
    for t in range(0, options.T):
        options.t = t
        Nt = M - options.step_size * t

        # STEP 0: when the feature dimension is high, ICD the seed and project the rest
        if wordsX.isPickled():
            wordsX.ICD_representation(Nt, options.eta)
            wordsY.ICD_representation(Nt, options.eta)

        # STEP 1: compute the CCA model on the well-matched portion of the
        # matching (which includes the fixed seed)
        fixedX = wordsX.features[Nt:, :]
        fixedY = wordsY.features[Nt:, :]
        if options.useCCAWeights == 1 and sorted_edge_cost is not None:
            q = np.square(sorted_edge_cost[Nt:])
            bandwidth = np.median(q)
            options.cca_weights = np.exp(-q / (2 * bandwidth))  # exp is useful when dist is used
        # if options.noise_level > 0:
        #     fixedX += options.noise_level * common.randn(fixedX.shape)
        #     fixedY += options.noise_level * common.randn(fixedY.shape)
        print >> sys.stderr, colored("CCA dimensions =", "green"), len(fixedX)
        cca_model = CU.learn(fixedX, fixedY, options)
        print >> sys.stderr, len(cca_model.p), "Top 10 correlation coefficients:", cca_model.p[:10]

        # STEP 2: compute the CCA representation of all samples
        print >> sys.stderr, "norms", norm(wordsX.features), norm(wordsY.features)
        Z = CU.project(options, cca_model, wordsX.features, wordsY.features)
        print >> sys.stderr, "Z", norm(Z.X), norm(Z.Y)

        # STEP 3: compute the weight matrix and run the (approximate) matching algorithm
        if options.alpha > 0:
            GX = wordsX.materializeGraph()
            GY = wordsY.materializeGraph()
        print >> sys.stderr, colored("Computing matching weight matrix.", "green")
        W, U0, Z0 = MU.makeWeights(options, Z.X, Z.Y, GX, GY)
        print >> sys.stderr, "Matching."
        (cost, pi_t, edge_cost) = MU.exactMatch(W[:M, :M])

        # STEP 4: sort the words such that the best matches are at the end.
        # Note that pi_t is of length M < N; the N - M fixed seed entries are
        # padded with zero cost below.
        (sorted_edge_cost, I) = perm.sort(edge_cost, reverse=True)
        sorted_edge_cost = np.concatenate((sorted_edge_cost, np.zeros(N - M)))
        if perm.isID(pi_t):  # the best permutation is the identity
            fixed_point = True
        else:
            wordsX.permuteFirstWords(I)
            wordsY.permuteFirstWords(pi_t[I])

        # END OF ITERATION: output the matching
        print >> sys.stderr, "cost =", cost, "latent inner product =", np.sum(Z.X.A * Z.Y.A)
        # MU.printMatching(wordsX.words[:M], wordsY.words[:M], sorted_edge_cost[:M], options.gold_lex)
        if options.gold_lex is not None:
            scores = BU.getScores(options.gold_lex, wordsX.words[:M], wordsY.words[:M], sorted_edge_cost[:M])
            BU.outputScores(scores, options.title)
        print "---------- ", "iteration = ", (t + 1), "/", options.T, "----------"
        sys.stdout.flush()
        if fixed_point:
            break

    # either we reached the maximum number of iterations, or a fixed point
    log(100, "Stopped after", (t + 1), "iterations. Fixed point =", fixed_point)
    IO.writeString(
        options.matchingFilename,
        MU.toString(wordsX.words[:M], wordsY.words[:M], sorted_edge_cost[:M], options.gold_lex),
    )
    if options.is_mock:
        log("Hamming distance:", perm.hamming(wordsX.words, wordsY.words))
    return wordsX, wordsY, sorted_edge_cost, cost
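
# A sketch of the growing-window schedule described in the comment at the top
# of find_matching (hypothetical sizes matching that example: 2000 matchable
# words, a 100-word seed, step size 100; the demo function is illustrative,
# not part of the original module). Iteration t fits the CCA on the last
# N - Nt rows, where Nt = M - step_size * t, so the trusted window grows by
# step_size rows per iteration while the first M rows remain matchable:
def _window_schedule_demo(N=2100, seed_length=100, step_size=100, T=5):
    M = N - seed_length              # first M rows may still be permuted
    for t in range(T):
        Nt = M - step_size * t       # rows [Nt:] are treated as well matched
        print "t=%d: CCA fit on %d rows, matching the first %d" % (t, N - Nt, M)
    # t=0: CCA fit on 100 rows (the seed alone), then 200, 300, ...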
if 'pickle' in filename_wordsX:
    wordsX = IO.readPickledWords(filename_wordsX)
    wordsY = IO.readPickledWords(filename_wordsY)
else:
    wordsX = IO.readWords(filename_wordsX)
    wordsY = IO.readWords(filename_wordsY)

if filename_lexicon == 'None':
    # we don't have a lexicon; assume identity
    log(100, 'Using identity lexicon')
    lex = None
    gold_lex = dict()
    # for w in wordsX.words: gold_lex[w] = [w]
    log(100, gold_lex)
else:
    lex = BilexiconUtil.readLexicon(filename_lexicon)
    (gold_lex, times) = BilexiconUtil.filterLexicon(lex, wordsX.words, wordsY.words)
    log(100, 'Done filtering gold lexicon')

seed = []
used_targets = set()
for source_word in wordsX.words:  # go over the source words
    if source_word in gold_lex:  # check that the source is in the lexicon
        translations = gold_lex[source_word]  # get the translations of the source
        # take the first translation whose target word is still unclaimed;
        # the for-else below handles sources left non-translated
        for translation in translations:
            if translation in wordsY.words and translation not in used_targets:
                seed.append((source_word, translation))
                used_targets.add(translation)
                print "%s,%s" % (source_word, translation)
                break
        else:
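
# A sketch of the greedy seed extraction above on toy data (hypothetical
# mini-lexicon; the demo function is illustrative, not part of the original
# module). Each source word claims its first translation that appears in the
# target vocabulary and has not been taken by an earlier source word, which
# keeps the seed one-to-one:
def _seed_demo():
    gold_lex = {'cat': ['gato'], 'dog': ['perro', 'can'], 'hound': ['perro', 'can']}
    target_words = ['gato', 'perro', 'can']
    seed, used = [], set()
    for src in ['cat', 'dog', 'hound']:
        for tr in gold_lex.get(src, []):
            if tr in target_words and tr not in used:
                seed.append((src, tr))
                used.add(tr)
                break
    print seed  # [('cat', 'gato'), ('dog', 'perro'), ('hound', 'can')]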
# command line arguments
filename_wordsX = sys.argv[1]
filename_wordsY = sys.argv[2]
filename_seed = sys.argv[3]
options = parseOptions()

# read input files
wordsX, wordsY, seed_list = readInput(options, filename_wordsX, filename_wordsY, filename_seed)
N = len(wordsX.words)
options.matchingFilename = "results/matching_N=%d_expid=%d_alpha=%2.2f_T=%d.txt" % (
    N,
    options.exp_id,
    options.alpha,
    options.T,
)
NSeed = len(seed_list.X)

if options.filename_lexicon is not None:
    lex = BU.readLexicon(options.filename_lexicon)
    (gold_lex, times) = BU.filterLexicon(lex, wordsX.words[:-NSeed], wordsY.words[:-NSeed])
    options.gold_lex = gold_lex
    print "Gold lexicon contains", len(gold_lex), "pairs."
else:
    options.gold_lex = None
    print colored("WARNING: No gold lexicon", "red")

print >> sys.stderr, "==============#########=========="
print >> sys.stderr, "Starting mCCA:"
print >> sys.stderr, NSeed, "seed pairs:", zip(seed_list.X, seed_list.Y)
(wordsX, wordsY, edge_cost, cost) = mcca(options, wordsX, wordsY, seed_list)
log(0, "hamming distance:", perm.hamming(wordsX.words, wordsY.words))
bell()
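
# Invocation sketch (the script name and file names are hypothetical; the
# script takes the two word files and a seed file as positional arguments,
# plus whatever flags parseOptions consumes):
#
#   python mcca_match.py wordsX.pickle wordsY.pickle seed.txt
#
# The matching is written to results/matching_N=..._expid=..._alpha=..._T=....txt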