def toString(wordsX, wordsY, sorted_edge_cost, lex=None):
    """Render matched word pairs as a printable multi-line string.

    Iterates over the first ``len(sorted_edge_cost)`` pairs of
    wordsX/wordsY. When a lexicon ``lex`` is given, each line is tagged
    "correct"/" wrong " according to ``BU.is_valid_match``; otherwise the
    tag is the boolean equality of the two words. One line per pair,
    each terminated by '\\n'.

    :param wordsX: sequence of source words
    :param wordsY: sequence of target words, aligned with wordsX
    :param sorted_edge_cost: per-pair costs; only its length is used here
    :param lex: optional lexicon consulted via BU.is_valid_match
    :return: the formatted string ('' when there are no pairs)
    """
    lines = []
    # range (not Py2-only xrange): iterates fine on both Python 2 and 3
    for n in range(len(sorted_edge_cost)):
        source_word = wordsX[n]
        target_word = wordsY[n]
        if lex is not None:
            matched = "correct" if BU.is_valid_match(lex, source_word, target_word) else " wrong "
        else:
            matched = source_word == target_word
        lines.append('{} - {:>10},{:>10}\n'.format(matched, source_word, target_word))
    # single join instead of quadratic repeated string concatenation
    return ''.join(lines)
# ---- Beispiel #2 (scrape artifact: example separator from source page) ----
def find_matching(options, wordsX, wordsY):
    """Iteratively align wordsY to wordsX via CCA plus bipartite matching.

    Each iteration: (1) fit a CCA model on the currently-trusted suffix of
    the matching (the fixed seed plus the best matches accumulated so far),
    (2) project all words into the shared CCA space, (3) solve an exact
    matching on the permutable prefix, and (4) sort the matches so the most
    confident pairs migrate toward the fixed end. Stops early when the
    matching reaches a fixed point (identity permutation), otherwise runs
    options.T iterations.

    Returns (wordsX, wordsY, sorted_edge_cost, cost); the word containers
    are permuted in place so wordsX.words[i] is matched to wordsY.words[i].

    NOTE(review): relies on project helpers (CU, MU, BU, IO, perm, norm,
    colored, log) whose exact contracts are not visible in this file;
    comments below are hedged accordingly.
    """
    # finds a permutation pi that best matches Y to X
    # The optimization procedure works as follows:
    # suppose there are 2000 words to be matched, 100 seed words and step size is 100
    # The seed is stored at the end (so, X[i, :] matches Y[i, :] for i > 2000] in all iterations
    # at each iteration t (starting at t=0):
    # 1. compute the CCA on the last 100 + 100*t entries
    # 2. compute the CCA representation of all words
    # 3. perform a matching on the first N=2000 words to get pi_t
    # 4. sort the first 2000 matches in descending order.

    # initially, assume that pi is ID
    N = len(wordsX.words)
    M = N - options.seed_length  # The first M entries can be permuted. The rest are fixed
    GX = None
    GY = None

    options.cca_weights = None
    sorted_edge_cost = None

    fixed_point = False
    for t in range(0, options.T):
        options.t = t
        # Nt = start index of the trusted suffix used to train CCA; it
        # shrinks by step_size each iteration, trusting more pairs over time.
        Nt = M - options.step_size * t
        # STEP 0: when the feature dimension is high, ICD the seed and project the rest
        if wordsX.isPickled():
            wordsX.ICD_representation(Nt, options.eta)
            wordsY.ICD_representation(Nt, options.eta)

        # STEP 1: compute CCA model on the well matched portion of the matching (which includes the fixed seed)
        fixedX = wordsX.features[Nt:, :]
        fixedY = wordsY.features[Nt:, :]
        if options.useCCAWeights == 1 and sorted_edge_cost is not None:
            # Gaussian-style weights from last iteration's edge costs:
            # low-cost (confident) pairs get weight near 1, high-cost near 0.
            q = np.square(sorted_edge_cost[Nt:])
            bandwidth = np.median(q)
            options.cca_weights = np.exp(-q / (2 * bandwidth))  # exp is useful when dist is used
        # if options.noise_level > 0:
        #     fixedX += options.noise_level*common.randn(fixedX.shape)
        #     fixedY += options.noise_level*common.randn(fixedY.shape)

        print >> sys.stderr, colored("CCA dimensions =", "green"), len(fixedX)
        cca_model = CU.learn(fixedX, fixedY, options)
        print >> sys.stderr, len(cca_model.p), "Top 10 correlation coefficients:", cca_model.p[:10]
        # STEP 2: compute CCA representation of all samples
        print >> sys.stderr, "norms", norm(wordsX.features), norm(wordsY.features)
        Z = CU.project(options, cca_model, wordsX.features, wordsY.features)

        print >> sys.stderr, "Z", norm(Z.X), norm(Z.Y)

        # STEP 3: compute weight matrix and run matching (approximate) algorithm
        if options.alpha > 0:
            # alpha > 0 mixes graph structure into the weights; graphs are
            # materialized lazily, only when actually needed.
            GX = wordsX.materializeGraph()
            GY = wordsY.materializeGraph()
        print >> sys.stderr, colored("Computing matching weight matrix.", "green")

        W, U0, Z0 = MU.makeWeights(options, Z.X, Z.Y, GX, GY)
        print >> sys.stderr, "Matching."
        # Only the permutable M x M prefix takes part in the matching.
        (cost, pi_t, edge_cost) = MU.exactMatch(W[:M, :M])
        # STEP 4: sort the words, such that the best matches are at the end.
        # note that pi_t is of length M < N and that
        (sorted_edge_cost, I) = perm.sort(edge_cost, reverse=True)
        # Pad with zeros for the N - M fixed seed entries (treated as perfect matches).
        sorted_edge_cost = np.concatenate((sorted_edge_cost, np.zeros(N - M)))

        if perm.isID(pi_t):  # the best permutation is the identity
            fixed_point = True
        else:
            # Apply the new ordering to both sides so the best matches end
            # up adjacent to the fixed seed suffix.
            wordsX.permuteFirstWords(I)
            wordsY.permuteFirstWords(pi_t[I])
            # END OF ITERATION: output Matching
        print >> sys.stderr, "cost =", cost, "latent inner product = ", np.sum(Z.X.A * Z.Y.A)

        # MU.printMatching(wordsX.words[:M], wordsY.words[:M], sorted_edge_cost[:M], options.gold_lex)
        if options.gold_lex is not None:
            # Evaluate the current matching against the gold lexicon.
            scores = BU.getScores(options.gold_lex, wordsX.words[:M], wordsY.words[:M], sorted_edge_cost[:M])
            BU.outputScores(scores, options.title)

        print "---------- ", "iteration = ", (t + 1), "/", options.T, "----------"
        sys.stdout.flush()
        if fixed_point:
            break

    # either we reached the maximum number of iterations, or a fixed point
    log(100, "Stopped after, ", (t + 1), "iterations. Fixed point =", fixed_point)
    IO.writeString(
        options.matchingFilename,
        MU.toString(wordsX.words[:M], wordsY.words[:M], sorted_edge_cost[:M], options.gold_lex),
    )
    if options.is_mock:
        # Mock runs know the true alignment, so report Hamming distance to it.
        log("Hamming distance:", perm.hamming(wordsX.words, wordsY.words))
    return wordsX, wordsY, sorted_edge_cost, cost
    # NOTE(review): fragment — this code is unreachable (it follows the
    # `return` above) and appears to belong to a different function whose
    # `def` line was lost in extraction; the trailing `else:` is also
    # truncated. Kept byte-identical. It loads word lists (pickled or
    # plain), builds or filters a gold lexicon, then greedily picks one
    # unused translation per source word as the seed.
    if 'pickle' in filename_wordsX:
        wordsX = IO.readPickledWords(filename_wordsX)
        wordsY = IO.readPickledWords(filename_wordsY)
    else:
        wordsX = IO.readWords(filename_wordsX)
        wordsY = IO.readWords(filename_wordsY)

    if filename_lexicon == 'None':  # we don't have a lexicon. assume identity.
        log(100, 'Using identity lexicon')
        lex = None
        gold_lex = dict()  # identity lexicon: each word translates to itself
        for w in wordsX.words:
            gold_lex[w] = [w]
        log(100,  gold_lex)
    else:
        lex = BilexiconUtil.readLexicon(filename_lexicon)
        (gold_lex, times) = BilexiconUtil.filterLexicon(lex, wordsX.words, wordsY.words)
        log(100, 'Done filtering gold lexicon')

    seed = []
    used_targets = set()  # each target word may be used as a seed at most once
    for source_word in wordsX.words:                      # go over source
        if source_word in gold_lex:                       # check source in lexicon
            translations = gold_lex[source_word]          # get translations of source
            for translation in translations:              # then, append translations for non-translated sources.
                if translation in wordsY.words and translation not in used_targets:
                    seed.append((source_word, translation))
                    used_targets.add(translation)
                    print "%s,%s" % (source_word, translation)
                    break
        else:
# ---- Beispiel #4 (scrape artifact: example separator from source page) ----
    # NOTE(review): fragment of a script entry point — the enclosing
    # `def`/`if __name__` header is not visible here. Kept byte-identical.
    # Parses CLI arguments, reads inputs, optionally loads a gold lexicon
    # (excluding the seed suffix), then runs mcca and reports results.
    # cmd line args
    filename_wordsX = sys.argv[1]
    filename_wordsY = sys.argv[2]
    filename_seed = sys.argv[3]
    options = parseOptions()
    # read input files
    wordsX, wordsY, seed_list = readInput(options, filename_wordsX, filename_wordsY, filename_seed)
    N = len(wordsX.words)
    # Output file name encodes the main experiment parameters.
    options.matchingFilename = "results/matching_N=%d_expid=%d_alpha=%2.2f_T=%d.txt" % (
        N,
        options.exp_id,
        options.alpha,
        options.T,
    )
    NSeed = len(seed_list.X)
    if options.filename_lexicon is not None:
        lex = BU.readLexicon(options.filename_lexicon)
        # Evaluate only on the permutable prefix: the trailing NSeed seed
        # entries are excluded from the gold-lexicon filtering.
        (gold_lex, times) = BU.filterLexicon(lex, wordsX.words[:-NSeed], wordsY.words[:-NSeed])
        options.gold_lex = gold_lex
        print "Gold lexicon contains", len(gold_lex), "pairs."
    else:
        options.gold_lex = None
        print colored("WARNING: No gold lexicon", "red")

    print >> sys.stderr, "==============#########=========="
    print >> sys.stderr, "Starting mCCA:"
    print >> sys.stderr, NSeed, "seed pairs:", zip(seed_list.X, seed_list.Y)
    (wordsX, wordsY, edge_cost, cost) = mcca(options, wordsX, wordsY, seed_list)
    log(0, "hamming distance:", perm.hamming(wordsX.words, wordsY.words))
    bell()