# Load both vocabularies. The pickled reader is selected when the X filename
# contains the substring 'pickle'; the Y filename is not inspected and is read
# with the same reader as X.
if 'pickle' in filename_wordsX:
        wordsX = IO.readPickledWords(filename_wordsX)
        wordsY = IO.readPickledWords(filename_wordsY)
    else:
        wordsX = IO.readWords(filename_wordsX)
        wordsY = IO.readWords(filename_wordsY)

    # NOTE: the comparison is against the literal string 'None', not the None
    # object.
    if filename_lexicon == 'None':  # we don't have a lexicon. assume identity.
        log(100, 'Using identity lexicon')
        lex = None
        # Identity lexicon: every source word maps to a one-element list
        # containing itself.
        gold_lex = dict()
        for w in wordsX.words:
            gold_lex[w] = [w]
        log(100,  gold_lex)
    else:
        lex = BilexiconUtil.readLexicon(filename_lexicon)
        # `times` is not used anywhere in the visible code.
        (gold_lex, times) = BilexiconUtil.filterLexicon(lex, wordsX.words, wordsY.words)
        log(100, 'Done filtering gold lexicon')

    # Greedy seed construction: walk the source words in order and pair each
    # with the first of its translations that occurs in wordsY.words and has
    # not already been used as a target, so every target appears at most once.
    seed = []
    used_targets = set()
    for source_word in wordsX.words:                      # go over source
        if source_word in gold_lex:                       # check source in lexicon
            translations = gold_lex[source_word]          # get translations of source
            for translation in translations:              # take the first usable translation
                # NOTE(review): if wordsY.words is a list this membership test
                # is a linear scan per candidate -- confirm its type.
                if translation in wordsY.words and translation not in used_targets:
                    seed.append((source_word, translation))
                    used_targets.add(translation)
                    # Each accepted pair is also written to stdout as "source,translation".
                    print "%s,%s" % (source_word, translation)
                    break
        # This else pairs with `if source_word in gold_lex` (source words that
        # have no lexicon entry); its body is not visible in this fragment.
        else:
Beispiel #2
0
    # cmd line args
    filename_wordsX = sys.argv[1]
    filename_wordsY = sys.argv[2]
    filename_seed = sys.argv[3]
    options = parseOptions()
    # read input files
    wordsX, wordsY, seed_list = readInput(options, filename_wordsX, filename_wordsY, filename_seed)
    N = len(wordsX.words)
    options.matchingFilename = "results/matching_N=%d_expid=%d_alpha=%2.2f_T=%d.txt" % (
        N,
        options.exp_id,
        options.alpha,
        options.T,
    )
    NSeed = len(seed_list.X)
    if options.filename_lexicon is not None:
        lex = BU.readLexicon(options.filename_lexicon)
        (gold_lex, times) = BU.filterLexicon(lex, wordsX.words[:-NSeed], wordsY.words[:-NSeed])
        options.gold_lex = gold_lex
        print "Gold lexicon contains", len(gold_lex), "pairs."
    else:
        options.gold_lex = None
        print colored("WARNING: No gold lexicon", "red")

    print >> sys.stderr, "==============#########=========="
    print >> sys.stderr, "Starting mCCA:"
    print >> sys.stderr, NSeed, "seed pairs:", zip(seed_list.X, seed_list.Y)
    (wordsX, wordsY, edge_cost, cost) = mcca(options, wordsX, wordsY, seed_list)
    log(0, "hamming distance:", perm.hamming(wordsX.words, wordsY.words))
    bell()